<?php
/**
 * Export a part of one wiki, rewriting page names/links.
 * Useful for merging wikis.
 * Based on dumpBackup.php and SpecialExport.php.
 */

/* TODO: For extracting part of a wiki, it would be useful to get a list of
   the images used, so that only those images need to be copied. */

$originalDir = getcwd();

$optionsWithArgs = array( 'server', 'pagelist', 'filter', 'rewrite' );

require_once( 'commandLine.inc' );
require_once( 'SpecialExport.php' );
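# Rules file format, as parsed by PageRewriteRules below. The following is a
# hypothetical example, purely for illustration; "#" lines match none of the
# rule forms and are therefore ignored as comments:
#
#   # namespaces to dump (numbers only)
#   d 0,4,10
#   # extra SQL condition ANDed into the page selection
#   s page_title LIKE 'Foo%'
#   # contributor rewrite: u <old username> => <new user id>,<new username>
#   u WikiSysop => 2,Admin
#   # general text rewrite applied to every revision: * <PCRE> => <replacement>
#   * /\bOldwiki\b/ => Newwiki
#   # namespaces that the following indented title rules apply to, with
#   # optional remapping (here, pages in namespace 4 also move to namespace 100)
#   0, 4 => 100
#    /^Main Page$/ => Merged main page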
class PageRewriteRules {
	var $file = null;
	var $dumpNamespaces = array();
	var $nsRules = array();
	var $nsMapping = array();
	var $extraSelectFilter = "";
	var $generalRewriteRules = array();
	var $contributorRewriteRules = array();

	function PageRewriteRules($lines) {
		$curNs = array(); // namespaces the current title rules apply to
		foreach ($lines as $line) {
			if (preg_match("/^d (.*)$/", $line, $m)) {
				// "d <ns>,<ns>,..." - namespaces to dump
				$nss = preg_split("/\s*,\s*/", trim($m[1]));
				foreach ($nss as $ns) {
					if (is_numeric($ns))
						$this->dumpNamespaces[] = $ns;
					else {
						print "cannot dump '$ns', expecting namespace numbers\n";
						exit;
					}
				}
			} else if (preg_match("/^s (.*)$/", $line, $m)) {
				// "s <SQL>" - extra SQL condition on the page selection
				$this->extraSelectFilter = $m[1];
			} else if (preg_match("/^\\*\s+(.*)\s*=>\s*(.*)$/", $line, $m)) {
				// "* <pattern> => <replacement>" - general text rewrite rule;
				// trim so a space before "=>" doesn't end up in the pattern
				$this->generalRewriteRules[] = array(trim($m[1]), trim($m[2]));
			} else if (preg_match("/^u\s+(.*)\s*=>\s*([0-9]+),(.*)$/", $line, $m)) {
				// Match the format: u <old username> => <new user id>,<new username>
				$this->contributorRewriteRules[] = array(trim($m[1]), trim($m[2]), trim($m[3]));
			} else if (preg_match("/^([0-9]+)/", $line)) {
				// "<ns>[ => <ns>],..." - namespaces that the following indented
				// title rules apply to, with optional namespace remapping
				$curNs = array();
				$nss = preg_split("/\s*,\s*/", trim($line));
				foreach ($nss as $ns) {
					if (preg_match("/^([0-9]+)\s*(=>\s*([0-9]+))?/", $ns, $m)) {
						if (isset($m[2])) {
							$this->nsMapping[$m[1]] = $m[3];
						}
						$curNs[] = $m[1];
					} else {
						print "$ns not understood, expecting namespace numbers\n";
						exit;
					}
				}
			} else if (preg_match("/^\s+(.*)\s*=>\s*(.*)$/", $line, $m)) {
				// indented "<pattern> => <replacement>" - title rewrite rule
				// for the namespaces named on the preceding line
				if (!$curNs) {
					print "$line occurred before namespace definition\n";
					exit;
				}
				foreach ($curNs as $ns) {
					$this->nsRules[$ns][] = array(trim($m[1]), trim($m[2]));
				}
			}
			// everything else is taken as a comment
		}
	}

	function pageFilter() {
		// returns an SQL WHERE condition that selects pages from the wanted
		// namespaces only
		$filter = "";
		foreach ($this->dumpNamespaces as $ns) {
			if ($filter != "")
				$filter .= " OR ";
			$filter .= "page_namespace=" . $ns;
		}
		if ($this->extraSelectFilter)
			$filter = "( $filter ) AND ( $this->extraSelectFilter )";
		return "($filter)";
	}

	function rewriteTitle(&$namespace, &$title) {
		$rules = isset($this->nsRules[$namespace]) ? $this->nsRules[$namespace] : null;
		if (isset($this->nsMapping[$namespace]))
			$namespace = $this->nsMapping[$namespace];
		if (!$rules)
			return;
		// only the first matching rule is applied
		foreach ($rules as $rule) {
			if (preg_match($rule[0], $title)) {
				$title = preg_replace($rule[0], $rule[1], $title);
				return;
			}
		}
	}

	function rewriteUser($userid, $username) {
		foreach ($this->contributorRewriteRules as $rule) {
			if (strcasecmp($username, $rule[0]) == 0) {
				$userid = $rule[1];
				$username = $rule[2];
			}
		}
		return array($userid, $username);
	}

	function rewriteSubpage($srcTitle, &$linkNs, &$linkTitle) {
		// possibly rewrite the link title as a subpage-relative link
		$srcNs = $srcTitle->getNamespace();
		$srcPage = $srcTitle->getText();
		if ($srcNs != $linkNs)
			return;
		$explodeSrc = explode('/', $srcPage);
		$explodeLink = explode('/', $linkTitle);
		// find the first difference between the path components
		for ($i = 0;
			$i < count($explodeSrc) && $i < count($explodeLink) &&
				$explodeSrc[$i] == $explodeLink[$i];
			$i++)
			; // empty
		if ($i == 0)
			return; // no common components, not a subpage
		$ret = "";
		for ($j = $i; $j < count($explodeSrc); $j++) {
			$ret .= "../";
		}
		if (!$ret)
			$ret = "/";
		for ($j = $i; $j < count($explodeLink); $j++) {
			// we want a terminating "/" so the initial "/" is not displayed
			$ret .= $explodeLink[$j] . "/";
		}
		$linkTitle = $ret;
	}

	function rewriteText($srcNs, $srcPage, $newTitle, $text) {
		// rewrite links in $text, located on the original page $srcNs:$srcPage,
		// which is in the process of being moved to $newTitle
		global $wgContLang;

		// Get internal wiki links and do replacements to match the title
		// replacements. Some of this comes from Parser.php/replaceInternalLinks.
		// This doesn't do real article parsing: for example, if [[link]] is
		// inside a comment or nowiki tag, it'll still be replaced.
		static $tc = FALSE;
		// the % is needed to support urlencoded titles as well
		if ( !$tc ) {
			$tc = Title::legalChars() . '#%';
		}
		static $pat = FALSE;
		if ( !$pat ) {
			$pat = "/(\\[\\[[{$tc}]+)(\\||]])/s";
		}
		$a = preg_split($pat, $text, -1, PREG_SPLIT_DELIM_CAPTURE);

		static $parser = FALSE;
		if (!$parser) {
			$parser = new Parser();
		}
		$parser->mTitle = Title::makeTitle($srcNs, $srcPage);

		foreach ($a as &$elem) {
			if (preg_match("/^\\[\\[(([^:]*):?(.*))$/s", $elem, $m)) {
				$ns = 0;
				$page = $m[1];
				// expand any subpage links
				$page = $parser->maybeDoSubpageLink($page, $tmp);
				if (isset($m[2])) {
					// don't rewrite interwiki links
					if (Title::getInterwikiLink($m[2]))
						continue;
					$ns = $wgContLang->getNsIndex($m[2]);
					if (is_numeric($ns))
						$page = $m[3];
					else
						$ns = 0;
				}
				// Force the first character of the page name to uppercase:
				// by default it is treated as uppercase in this namespace on
				// the source wiki, but after rewriting it may no longer be,
				// leading to the incorrect page being referred to.
				$page = ucfirst($page);
				$this->rewriteTitle($ns, $page);
				$this->rewriteSubpage($newTitle, $ns, $page);
				$title = Title::makeTitle($ns, $page);
				$elem = "[[" . $title->getPrefixedText();
			}
		}
		$text = implode('', $a);

		# handle general rewrite rules
		foreach ($this->generalRewriteRules as $rule) {
			$text = preg_replace($rule[0], $rule[1], $text);
		}
		return $text;
	}
}
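/*
 * A minimal sketch of the rewriter in isolation (hypothetical values, assuming
 * the example rules file above has been read into $lines):
 *
 *   $rules = new PageRewriteRules( $lines );
 *   $ns = 4;
 *   $title = "Main Page";
 *   $rules->rewriteTitle( $ns, $title );
 *   // now $ns == 100 and $title == "Merged main page"
 *
 *   print $rules->pageFilter();
 *   // roughly: (( page_namespace=0 OR page_namespace=4 OR page_namespace=10 )
 *   //           AND ( page_title LIKE 'Foo%' ))
 */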
class WikiMerger extends WikiExporter {
	var $rewriter = null;
	var $lastTitle = null;

	function getNewUser($row) {
		// wrapper for the rewriter class; $row is modified in place
		$result = $this->rewriter->rewriteUser($row->rev_user, $row->rev_user_text);
		$row->rev_user = $result[0];
		$row->rev_user_text = $result[1];
	}

	function openPage( $row ) {
		print "  <page>\n";
		$ns = $row->page_namespace;
		$title = $row->page_title;
		$this->rewriter->rewriteTitle($ns, $title);
		$title = Title::makeTitle($ns, $title);
		$this->lastTitle = $title;
		print '    ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
		print '    ' . wfElement( 'id', array(), $row->page_id ) . "\n";
		if( '' != $row->page_restrictions ) {
			print '    ' . wfElement( 'restrictions', array(), $row->page_restrictions ) . "\n";
		}
	}

	function dumpRev( $row ) {
		$fname = 'WikiMerger::dumpRev';
		wfProfileIn( $fname );

		print "    <revision>\n";
		print "      " . wfElement( 'id', null, $row->rev_id ) . "\n";

		$ts = wfTimestamp2ISO8601( $row->rev_timestamp );
		print "      " . wfElement( 'timestamp', null, $ts ) . "\n";

		print "      <contributor>";
		$this->getNewUser($row);
		if( $row->rev_user ) {
			print wfElementClean( 'username', null, $row->rev_user_text );
			print wfElement( 'id', null, $row->rev_user );
		} else {
			print wfElementClean( 'ip', null, $row->rev_user_text );
		}
		print "</contributor>\n";

		if( $row->rev_minor_edit ) {
			print "      <minor/>\n";
		}
		if( $row->rev_comment != '' ) {
			print "      " . wfElementClean( 'comment', null, $row->rev_comment ) . "\n";
		}

		$text = Revision::getRevisionText( $row );
		$text = $this->rewriter->rewriteText($row->page_namespace, $row->page_title,
			$this->lastTitle, $text);
		print "      " . wfElementClean( 'text', array( 'xml:space' => 'preserve' ), $text ) . "\n";

		print "    </revision>\n";

		wfProfileOut( $fname );

		if( isset( $this->revCallback ) ) {
			call_user_func( $this->revCallback, $row );
		}
	}
}

class BackupDumper {
	var $reportingInterval = 100;
	var $reporting = true;
	var $pageCount = 0;
	var $revCount = 0;
	var $server = null; // use default
	var $rewriter = null;
	var $stderr = null;

	function BackupDumper() {
		$this->stderr = fopen( "php://stderr", "wt" );
	}

	function dump( $history ) {
		# This shouldn't happen if on console... ;)
		header( 'Content-type: text/html; charset=UTF-8' );

		# Notice messages will foul up your XML output even if they're
		# relatively harmless.
		ini_set( 'display_errors', false );

		$dbr =& wfGetDB( DB_SLAVE );
		$this->maxCount = $dbr->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' );
		$this->startTime = wfTime();

		$db =& $this->backupDb();
		$exporter = new WikiMerger( $db, $history, MW_EXPORT_STREAM );
		$exporter->setPageCallback( array( &$this, 'reportPage' ) );
		$exporter->setRevisionCallback( array( &$this, 'revCount' ) );
		$exporter->rewriter = $this->rewriter;

		$exporter->openStream();
		$exporter->dumpFrom( $this->rewriter->pageFilter() );
		$exporter->closeStream();

		$this->report( true );
	}

	function &backupDb() {
		global $wgDBadminuser, $wgDBadminpassword;
		global $wgDBname;
		$db =& new Database( $this->backupServer(), $wgDBadminuser, $wgDBadminpassword, $wgDBname );
		$timeout = 3600 * 24;
		$db->query( "SET net_read_timeout=$timeout" );
		$db->query( "SET net_write_timeout=$timeout" );
		return $db;
	}

	function backupServer() {
		global $wgDBserver;
		return $this->server ? $this->server : $wgDBserver;
	}

	function reportPage( $page ) {
		$this->pageCount++;
		$this->report();
	}

	function revCount( $rev ) {
		$this->revCount++;
	}

	function report( $final = false ) {
		if( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) {
			$this->showReport();
		}
	}

	function showReport() {
		if( $this->reporting ) {
			$delta = wfTime() - $this->startTime;
			$now = wfTimestamp( TS_DB );
			if( $delta ) {
				$rate = $this->pageCount / $delta;
				$revrate = $this->revCount / $delta;
				$portion = $this->pageCount / $this->maxCount;
				$eta = $this->startTime + $delta / $portion;
				$etats = wfTimestamp( TS_DB, intval( $eta ) );
			} else {
				$rate = '-';
				$revrate = '-';
				$etats = '-';
			}
			global $wgDBname;
			$this->progress( "$now: $wgDBname $this->pageCount, ETA $etats ($rate pages/sec $revrate revs/sec)" );
		}
	}

	function progress( $string ) {
		fwrite( $this->stderr, $string . "\n" );
	}
}
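/*
 * Typical invocation, a sketch (the script name is a placeholder for whatever
 * this file is saved as). The XML dump is written to stdout, progress goes to
 * stderr, and the rules file argument defaults to "dump.conf":
 *
 *   php extractAndMergeWiki.php [--quiet] [--report=n] [--server=host] \
 *       [--current] rules.conf > merged.xml
 *
 * --current exports only the latest revision of each page; the default is the
 * full history.
 */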
$dumper = new BackupDumper();

if( isset( $options['quiet'] ) ) {
	$dumper->reporting = false;
}
if( isset( $options['report'] ) ) {
	$dumper->reportingInterval = IntVal( $options['report'] );
}
if( isset( $options['server'] ) ) {
	$dumper->server = $options['server'];
}

if ( isset( $args[0] ) ) {
	$file = $args[0];
} else {
	$file = "dump.conf"; // default rules file name
}

$olddir = getcwd();
chdir( $originalDir );
$lines = file( $file );
chdir( $olddir );
if ( $lines === false ) {
	print "Unable to open file {$file}\n";
	exit;
}
$dumper->rewriter = new PageRewriteRules( $lines );

if ( isset( $options['current'] ) ) {
	$dumper->dump( MW_EXPORT_CURRENT );
} else {
	$dumper->dump( MW_EXPORT_FULL );
}

?>