Utente:Leonard Vertighel/temp
Unfinished anchor link check script.
Released under GPL.
(Contains code from MediaWiki 1.5)
BUG: Add boilerplate text... ;)
Tables: create table if not exists sl_links ( sll_from varchar(255), sll_to varchar(255), sll_sec varchar(255) ); create table if not exists sl_targets ( slt_title varchar(255), slt_sec varchar(255), index(slt_title, slt_sec) ); <?php //config.php $config['mysql_host'] = '127.0.0.1'; $config['mysql_user'] = 'root'; $config['mysql_pass'] = ''; $config['mysql_db'] = 'wikipedia'; ?> <?php //functions.php // database function db_connect($new = false) { global $config; $db = mysql_connect($config['mysql_host'], $config['mysql_user'], $config['mysql_pass'], $new); mysql_select_db($config['mysql_db'], $db); return $db; } // functions from MediaWiki 1.5 function MW_decodeCharReferences( $text ) { return preg_replace_callback( '/&([A-Za-z0-9]+);|&\#([0-9]+);|&\#x([0-9A-Za-z]+);|&\#X([0-9A-Za-z]+);|(&)/x', 'MW_decodeCharReferencesCallback', $text ); } function MW_decodeCharReferencesCallback( $matches ) { if( $matches[1] != '' ) { return MW_decodeEntity( $matches[1] ); } elseif( $matches[2] != '' ) { return MW_decodeChar( intval( $matches[2] ) ); } elseif( $matches[3] != '' ) { return MW_decodeChar( hexdec( $matches[3] ) ); } elseif( $matches[4] != '' ) { return MW_decodeChar( hexdec( $matches[4] ) ); } # Last case should be an ampersand by itself return $matches[0]; } function MW_decodeEntity( $name ) { $wgHtmlEntities = array( 'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226, 'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192, 'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945, 'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227, 'Auml' => 196, 'auml' => 228, 'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166, 'bull' => 8226, 'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629, 'cup' => 8746, 'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659, 'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830, 'divide' => 247, 'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240, 'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707, 'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188, 'frac34' => 190, 'frasl' => 8260, 'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62, 'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230, 'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238, 'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465, 'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953, 'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239, 'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171, 'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60, 'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183, 'minus' => 8722, 'Mu' => 924, 'mu' => 956, 'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800, 'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957, 'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244, 'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242, 'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170, 'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246, 'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869, 'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960, 'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242, 'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936, 'psi' => 968, 'quot' => 34, 'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476, 'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961, 'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217, 'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901, 'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223, 'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920, 'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482, 'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252, 'weierp' => 8472, 'Xi' => 926, 'xi' => 958, 'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255, 'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 ); if( isset( $wgHtmlEntities[$name] ) ) { return MW_codepointToUtf8( $wgHtmlEntities[$name] ); } else { return "&$name;"; } } function MW_decodeChar( $codepoint ) { if( MW_validateCodepoint( $codepoint ) ) { return MW_codepointToUtf8( $codepoint ); } else { return ''; # ok forget about UTF8_REPLACEMENT; for now } } function MW_validateCodepoint( $codepoint ) { return ($codepoint == 0x09) || ($codepoint == 0x0a) || ($codepoint == 0x0d) || ($codepoint >= 0x20 && $codepoint <= 0xd7ff) || ($codepoint >= 0xe000 && $codepoint <= 0xfffd) || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff); } function MW_codepointToUtf8( $codepoint ) { if($codepoint < 0x80) return chr($codepoint); if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) . chr($codepoint & 0x3f | 0x80); if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) . chr($codepoint >> 6 & 0x3f | 0x80) . chr($codepoint & 0x3f | 0x80); if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) . chr($codepoint >> 12 & 0x3f | 0x80) . chr($codepoint >> 6 & 0x3f | 0x80) . chr($codepoint & 0x3f | 0x80); die("Asked for code outside of range ($codepoint)\n"); } // end functions from MediaWiki ?> <?php //Warning: truncate the sl_* tables first //BUG: will output mysql error if query has no values //BUG: urlencode()d links should be decoded //BUG: nedd to take multiple equal section titles into account //NOTE: Redirs are excluded in theory, but some are not marked as such in dump // (might be useful to take them into account as well, // so they will be correct if they will be supported in future) //We are not taking into account: //links in templates //sections in templates //templates in section titles //Other problems: //Markup in section titles, in particular math //(simple markup like links, italics etc. is ok) require_once('config.php'); require_once('functions.php'); $dbr = db_connect(); $dbw = db_connect(true); // for now, we are just concerned with main namespace, // so we will ignore all links with namespace or interwiki prefix // prefixes are NOT case sensitive $prefixes = array( 'media', 'speciale', 'special', 'discussione', 'talk', 'utente', 'user', 'discussioni_utente', 'user_talk', 'wikipedia', 'discussioni_wikipedia', 'wikipedia_talk', 'immagine', 'image', 'discussioni_immagine', 'image_talk', 'mediawiki', 'discussioni_mediawiki', 'mediawiki_talk', 'template', 'discussioni_template', 'template_talk', 'aiuto', 'help', 'discussioni_aiuto', 'help_talk', 'categoria', 'category', 'discussioni_categoria', 'category_talk', 'portale', 'portal', 'discussioni_portale', 'portal_talk'); $query = 'select iw_prefix from interwiki'; $result = mysql_query ($query, $dbr); while ( $row = mysql_fetch_array($result) ) { $prefixes[] = $row['iw_prefix']; } echo count($prefixes), " prefixes found.\n"; echo "Retrieving pages from db...\n"; $query = 'select page_title, old_text from page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 0'; $result = mysql_unbuffered_query($query, $dbr); $s_l = ' '; $s_t = ' '; $query_l = "insert into sl_links (sll_from, sll_to, sll_sec) values"; $query_t = "insert into sl_targets (slt_title, slt_sec) values"; $counter = 0; echo "Starting section title/anchor and section link extraction...\n"; while ( $row = mysql_fetch_assoc($result) ) { $title = mysql_escape_string($row['page_title']); // remove html comments (what about nested comments?) // decode references now to avoid trouble with &#num; entities $tmp = explode('<!--', MW_decodeCharReferences($row['old_text'])); $text = $tmp[0]; unset($tmp[0]); foreach ( $tmp as $tmp2 ) { list($dummy, $tmp3) = explode('-->', $tmp2, 2); $text .= $tmp3; } // sections (may occur at very beginning of page!) preg_match_all('/(^|\n)={2,6}([^=\n]([^\n]*[^=\n])?)={2,6}/u', $text, $matches, PREG_SET_ORDER); foreach ( $matches as $match ) { // do NOT collapse whitespace (mimick behaviour of MediaWiki 1.6alpha) $sec = strtr(trim($match[2]), ' ', '_'); // quick and dirty cleanup of most frequent wiki-markup etc. // will produce incorrect results with math, templates, and some other cases $sec = preg_replace("/'{2,3}/u", '', $sec); // italics, bold (might kill some apostrophe) $sec = preg_replace('/\[\[[^]|\n]+\|([^]\n]+)\]\]/u', '$1', $sec); // wikilink w/ text $sec = preg_replace('/\[\[|\]\]/u', '', $sec); // any further double square brackets (even non-matched) $sec = preg_replace('|<[^>\n]+>|u', '', $sec); // anything that vaguely resembles an HTML tag // this should be more or less the anchor name: // (note that we cannot recover original text from this) $sec = strtr(urlencode($sec), '%', '.'); $query_t .= $s_t . "('$title', '$sec')"; $s_t = ', '; } // anchors ('id' attributes of html elements - 'name' seems not accepted as of MW1.6alpha) // (should probably be restricted to allowed elements) // (maybe should check for uniqueness?) // very rough: preg_match_all('/<[a-z]+[^<>\n]*[\s^\n]+id[\s^\n]*=[\s^\n]*([^<>\n]*)>/u', $text, $matches, PREG_SET_ORDER); foreach ( $matches as $match ) { $tmp = $match[1]; $anc = ''; if ( strpos($tmp, '"') === 0 ) { $tmp = mb_substr($tmp, 1); if ( strpos($tmp, '"') ) { list($anc, $dummy) = explode('"', $tmp, 2); } } elseif ( strpos($tmp, "'") === 0 ) { $tmp = mb_substr($tmp, 1); if ( strpos($tmp, "'") ) { list($anc, $dummy) = explode("'", $tmp, 2); } } else { preg_match('/^[A-Za-z0-9]+/u', $tmp, $a_match); $anc = $a_match[0]; } // do NOT collapse NOR trim whitespace (mimick behaviour of MediaWiki 1.6alpha) $anc = strtr($anc, ' ', '_'); // this should be more or less the anchor name: // (note that we cannot recover original text from this) $anc = strtr(urlencode($anc), '%', '.'); if ( $anc ) { $query_t .= $s_t . "('$title', '$anc')"; $s_t = ', '; } } // links to sections (note that prefix must NOT contain any ':') preg_match_all('/\[\[:?(([^]|\n:]+):)?([^]|\n]*)#([^]|\n]+)(\||\]\])/u', $text, $matches, PREG_SET_ORDER); foreach ( $matches as $match ) { // we are concerned only with internal links to main namespace: $l_prefix = strtolower(preg_replace('/[\s_]+/u', '_', trim($match[2]))); if ( in_array($l_prefix, $prefixes) ) { continue; } // not a valid prefix, so add it back to title: $tmp = preg_replace('/[\s_]+/u', '_', trim($match[1] . $match[3])); // collapse whitespace and _ $l_title = mysql_escape_string(mb_strtoupper(mb_substr($tmp, 0, 1)) . mb_substr($tmp, 1)); if ( $l_title == '' ) { $l_title = $title; } $l_sec = preg_replace('/[\s_]+/u', '_', trim($match[4])); // collapse whitespace and _ // this is just a guess, MediaWiki source should be analyzed to confirm/correct if ( $l_sec != urldecode($l_sec) ) { $l_sec = strtr($l_sec, '%', '.'); } else { $l_sec = strtr(urlencode($l_sec), '%', '.'); } $query_l .= $s_l . "('$title', '$l_title', '$l_sec')"; $s_l = ', '; } $counter++; if ( ($counter%1000) == 0 ) { echo $counter, "\n"; mysql_query($query_l, $dbw); if ( mysql_errno($dbw) ) { echo mysql_error($dbw), "\n"; } mysql_query($query_t, $dbw); if ( mysql_errno($dbw) ) { echo mysql_error($dbw), "\n"; } $s_l = ' '; $s_t = ' '; $query_l = "insert into sl_links (sll_from, sll_to, sll_sec) values"; $query_t = "insert into sl_targets (slt_title, slt_sec) values"; } } // leftover query (not an elegant solution...) mysql_query($query_l, $dbw); if ( mysql_errno($dbw) ) { echo mysql_error($dbw), "\n"; } mysql_query($query_t, $dbw); if ( mysql_errno($dbw) ) { echo mysql_error($dbw), "\n"; } // and now for the redirects echo "Loading redirects from db...\n"; $query = 'select page_title, old_text from page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 1'; $result = mysql_unbuffered_query($query, $dbr); $counter = 0; echo "Creating redirect targets...\n"; while ( $row = mysql_fetch_assoc($result) ) { $title = mysql_escape_string($row['page_title']); $text = MW_decodeCharReferences($row['old_text']); // we use essentially the same regexp as above, //since prefixes and anchors etc. might exist here as well //(but anchor is optional (and meaningless as of MW1.6alpha)) preg_match('/#\s*redirect\s*\[\[:?(([^]|\n:]+):)?([^]|\n]*)(#([^]|\n]+))?(\||\]\])/ui', $text, $match); // we are concerned only with internal links to main namespace: $l_prefix = strtolower(preg_replace('/[\s_]+/u', '_', trim($match[2]))); if ( in_array($l_prefix, $prefixes) ) { continue; } // not a valid prefix, so add it back to title: $tmp = preg_replace('/[\s_]+/u', '_', trim($match[1] . $match[3])); // collapse whitespace and _ $l_title = mysql_escape_string(mb_strtoupper(mb_substr($tmp, 0, 1)) . mb_substr($tmp, 1)); $query = "insert into sl_targets (slt_title, slt_sec) select '$title', slt_sec from sl_targets where slt_title = '$l_title'"; mysql_query($query, $dbw); if ( mysql_errno($dbw) ) { echo mysql_error($dbw), "\n"; } $counter++; if ( ($counter%1000) == 0 ) { echo $counter, "\n"; } } ?> Queries: General list: Page-containing-link: link (number of occurrences in page) select concat('#[[', sll_from, ']]: [[', sll_to, '#', sll_sec, ']] (', count(*), ')') from sl_links left join sl_targets on sll_to = slt_title and sll_sec = slt_sec where slt_sec is null group by sll_from, sll_to, sll_sec; Star Wars planets (and analogous cases): select concat('#[[Pianeti_di_Guerre_Stellari#', sll_sec, ']]: ', group_concat(concat('[[', sll_from, ']]') separator ', ')) from sl_links left join sl_targets on sll_to = slt_title and sll_sec = slt_sec where sll_to = 'Pianeti_di_Guerre_Stellari' and slt_sec is null group by sll_sec;