<?php
/**
 * Export a part of one wiki with rewriting names of page names/links.
 * Useful for merging wikis.
 * Based on dumpBackup.php and SpecialExport.php
 *
 * Thanks to Isaac Lo for implementing the mapping of contributor usernames
 *
 * NOTE(review): this source arrived with markup-like text stripped (the
 * `<?php` tag, the XML literals in openPage()/writeRevision(), the heredoc
 * opener, and the `<...>` placeholders in the usage text). Those have been
 * reconstructed from the standard MediaWiki export format / dumpBackup.php
 * — confirm against the target MediaWiki version.
 */

/* TODO:
 * For extracting part of a wiki, it would be useful to get a list of images
 * used so that only those images need copying.
 * Template inclusions may not be rewritten properly.
 */

// *** Directly from dumpBackup.php
$originalDir = getcwd();

require_once( 'commandLine.inc' );
require_once( 'SpecialExport.php' );
require_once( 'maintenance/backup.inc' );

/**
 * Parses a rewrite-rule config file and applies its rules to page titles,
 * contributor names, and wikitext links during export.
 *
 * Config line formats understood by the constructor:
 *   d ns1,ns2,...                namespaces (numeric) to dump
 *   s <SQL>                      extra SQL select filter; multiple "s" lines
 *                                are OR-combined
 *   * <pattern> => <replace>     general preg rewrite applied to revision text
 *   u <name> => <id>,<newname>   contributor rename (case-insensitive name match)
 *   ns[=>ns2][,ns...]            namespace list (optionally remapped) to which
 *                                the following indented title rules apply
 *     <pattern> => <replace>     title rewrite rule for the namespaces above
 * Anything else is treated as a comment.
 */
class PageRewriteRules {
	var $file = null;
	var $dumpNamespaces = array();         // list of namespace numbers to export
	var $nsRules = array();                // ns => list of array(pattern, replacement)
	var $nsMapping = array();              // source ns => destination ns
	var $extraSelectFilter = "";           // extra SQL WHERE fragment
	var $generalRewriteRules = array();    // list of array(pattern, replacement) for text
	var $contributorRewriteRules = array();// list of array(oldname, newid, newname)

	/**
	 * @param array $lines Lines of the config file (as returned by file()).
	 * Prints an error and exits on a malformed line.
	 */
	function PageRewriteRules($lines) {
		// Namespaces that subsequent indented title rules apply to.
		// Initialized here so the "before namespace definition" check below
		// does not trip an undefined-variable notice.
		$curNs = array();
		foreach ($lines as $line) {
			if (preg_match("/^d (.*)$/", $line, $m)) {
				$nss = preg_split("/\s*,\s*/", trim($m[1]));
				foreach ($nss as $ns) {
					if (is_numeric($ns)) {
						$this->dumpNamespaces[] = $ns;
					} else {
						print "cannot dump '$ns'. expecting namespace numbers\n";
						exit;
					}
				}
			} else if (preg_match("/^s (.*)$/", $line, $m)) {
				// Multiple "s" lines are OR-combined.
				if ($this->extraSelectFilter)
					$this->extraSelectFilter = "( $this->extraSelectFilter ) OR ($m[1])";
				else
					$this->extraSelectFilter = $m[1];
			} else if (preg_match("/^\\*\s+(.*)\s*=>\s*(.*)$/", $line, $m)) {
				$this->generalRewriteRules[] = array($m[1], $m[2]);
			} else if (preg_match("/^u\s+(.*)\s*=>\s*([0-9]+),(.*)$/", $line, $m)) {
				// Match the format: u <oldname> => <newid>,<newname>
				$this->contributorRewriteRules[] = array(trim($m[1]), trim($m[2]), trim($m[3]));
			} else if (preg_match("/^(-?[0-9]+)/", $line)) {
				// Namespace list line; optionally remaps each namespace.
				$curNs = array();
				$nss = preg_split("/\s*,\s*/", trim($line));
				foreach ($nss as $ns) {
					if (preg_match("/^(-?[0-9]+)\s*(=>\s*([0-9]+))?/", $ns, $m)) {
						if (isset($m[2])) {
							$this->nsMapping[$m[1]] = $m[3];
						}
						$curNs[] = $m[1];
					} else {
						print "$ns not understood, expecting namespace numbers\n";
						exit;
					}
				}
			} else if (preg_match("/^\s+(.*)\s*=>\s*(.*)$/", $line, $m)) {
				// Indented title rule; applies to the most recent namespace line.
				if (! $curNs) {
					print "$line occurred before namespace definition\n";
					exit;
				}
				foreach ($curNs as $ns) {
					$this->nsRules[$ns][] = array($m[1], $m[2]);
				}
			}
			// everything else is taken as a comment
		}
	}

	/**
	 * Build the SQL WHERE limitation selecting only the pages we want.
	 * @return string Parenthesized SQL boolean expression; "(1)" if unrestricted.
	 */
	function pageFilter() {
		$filter = "";
		foreach ($this->dumpNamespaces as $ns) {
			if ($filter != "")
				$filter .= " OR ";
			$filter .= " page_namespace=" . $ns;
		}
		if ($this->extraSelectFilter) {
			// Fixed: previously an empty namespace filter produced the
			// invalid SQL "(  ) AND ( ... )".
			if ($filter != "")
				$filter = "( $filter ) AND ( $this->extraSelectFilter )";
			else
				$filter = $this->extraSelectFilter;
		}
		if (!$filter)
			return "(1)";
		return "($filter)";
	}

	/**
	 * Rewrite a title in place: remap its namespace per nsMapping, then apply
	 * the first matching title rule for the ORIGINAL namespace.
	 * @param int    &$namespace Namespace number (modified in place)
	 * @param string &$title     Page title text (modified in place)
	 */
	function rewriteTitle(&$namespace, &$title) {
		// isset guard avoids an undefined-index notice for namespaces
		// without title rules.
		$rules = isset($this->nsRules[$namespace]) ? $this->nsRules[$namespace] : null;
		if (isset($this->nsMapping[$namespace]))
			$namespace = $this->nsMapping[$namespace];
		if (!$rules)
			return;
		foreach ($rules as $rule) {
			// Only the first matching rule is applied.
			if (preg_match($rule[0], $title)) {
				$title = preg_replace($rule[0], $rule[1], $title);
				return;
			}
		}
	}

	/**
	 * Map a contributor to a new id/name per the "u" rules.
	 * Matching is case-insensitive on the username; the LAST matching rule wins.
	 * @return array array(newUserId, newUserName) — unchanged if no rule matches.
	 */
	function rewriteUser($userid, $username) {
		foreach ($this->contributorRewriteRules as $rule) {
			if (strcasecmp($username, $rule[0]) == 0) {
				$userid = $rule[1];
				$username = $rule[2];
			}
		}
		return array($userid, $username);
	}

	/**
	 * If $linkTitle is a subpage sibling of $srcTitle in the same namespace,
	 * rewrite it in place as a relative subpage link ("../", "/sub", etc.).
	 * @param Title  $srcTitle   Title of the page containing the link
	 * @param int    &$linkNs    Namespace of the link target
	 * @param string &$linkTitle Link target text (modified in place)
	 */
	function rewriteSubpage($srcTitle, &$linkNs, &$linkTitle) {
		$srcNs = $srcTitle->getNamespace();
		$srcPage = $srcTitle->getText();
		// Relative links only make sense within the same namespace.
		if ($srcNs != $linkNs)
			return;
		$explodeSrc = explode( '/', $srcPage);
		$explodeLink = explode( '/', $linkTitle);
		// Find the first difference between the path components.
		for ($i = 0;
		     $i < count($explodeSrc) && $i < count($explodeLink) &&
		     $explodeSrc[$i] == $explodeLink[$i];
		     $i++)
			; // empty
		if ($i == 0)
			return; // no common components, not a subpage
		$ret = "";
		// One "../" per remaining source component.
		for ($j = $i; $j < count($explodeSrc); $j++) {
			$ret .= "../";
		}
		if (!$ret)
			$ret = "/";
		for ($j = $i; $j < count($explodeLink); $j++) {
			// we want a terminating "/" to not display the initial /
			$ret .= $explodeLink[$j] . "/";
		}
		$linkTitle = $ret;
	}

	/**
	 * Rewrite links in $text, which lives on the original page $srcNs:$srcPage
	 * currently being moved to $newTitle, then apply the general rewrite rules.
	 *
	 * This does NOT do real article parsing: a [[link]] inside a comment or
	 * nowiki tag will still be rewritten.
	 *
	 * @param int    $srcNs    Original namespace of the page
	 * @param string $srcPage  Original title of the page
	 * @param Title  $newTitle Destination title of the page
	 * @param string $text     Revision wikitext
	 * @return string Rewritten wikitext
	 */
	function rewriteText($srcNs, $srcPage, $newTitle, $text) {
		global $wgContLang;

		// Get internal wiki links and do replacements to match title
		// replacements; some of this from Parser.php/replaceInternalLinks.
		static $tc = FALSE;
		// the % is needed to support urlencoded titles as well
		if ( !$tc ) {
			$tc = Title::legalChars() . '#%';
		}
		static $pat = FALSE;
		if ( !$pat ) {
			$pat = "/(\\[\\[[{$tc}]+)(\\||]])/s";
		}
		$a = preg_split($pat, $text, -1, PREG_SPLIT_DELIM_CAPTURE);

		static $parser = FALSE;
		if (!$parser) {
			$parser = new Parser();
		}
		// The parser needs the source title to expand subpage links.
		$parser->mTitle = Title::makeTitle($srcNs, $srcPage);

		foreach ($a as &$elem) {
			if (preg_match("/^\\[\\[(([^:]*):?(.*))$/s", $elem, $m)) {
				$ns = 0;
				$page = $m[1];
				// expand any subpage links
				$page = $parser->maybeDoSubpageLink($page, $tmp);
				if (isset($m[2])) {
					// don't rewrite interwiki links
					if (Title::getInterwikiLink($m[2]))
						continue;
					$ns = $wgContLang->getNsIndex($m[2]);
					if (is_numeric($ns))
						$page = $m[3];
					else
						$ns = 0;
				}
				// Force first character of page name to be uppercase:
				// by default it is treated as uppercase on the source wiki,
				// but after rewriting it may no longer be, leading to the
				// incorrect page being referred to.
				$page = ucfirst($page);
				$this->rewriteTitle($ns, $page);
				$this->rewriteSubpage($newTitle, $ns, $page);
				$title = Title::makeTitle($ns, $page);
				$elem = "[[" . $title->getPrefixedText();
			}
		}
		unset($elem); // break the dangling reference left by foreach-by-ref
		$text = implode('', $a);

		# handle general rewrite rules
		foreach ($this->generalRewriteRules as $rule) {
			$text = preg_replace($rule[0], $rule[1], $text);
		}
		return $text;
	}
}

/**
 * XmlDumpWriter subclass that pipes titles, contributors, and revision text
 * through a PageRewriteRules instance while producing the export XML.
 *
 * NOTE(review): the XML element literals below were stripped in the extracted
 * source and have been reconstructed from the MediaWiki export format.
 */
class XmlDumpRewriter extends XmlDumpWriter {
	var $rewriter = null;

	/** @param PageRewriteRules $rewriter */
	function XmlDumpRewriter($rewriter) {
		$this->rewriter = $rewriter;
	}

	/**
	 * Compute the rewritten Title for a page row.
	 * @return Title
	 */
	function getNewTitle($row) {
		$ns = $row->page_namespace;
		$title = $row->page_title;
		// Fixed: call-time pass-by-reference (&$ns, &$title) removed — the
		// method signature already declares the parameters by reference, and
		// call-time references are a fatal error since PHP 5.4.
		$this->rewriter->rewriteTitle($ns, $title);
		return Title::makeTitle($ns, $title);
	}

	/**
	 * Wrapper for the rewriter class; rewrites the contributor fields of
	 * $row IN PLACE (no return value).
	 */
	function getNewUser($row) {
		$result = $this->rewriter->rewriteUser($row->rev_user, $row->rev_user_text);
		$row->rev_user = $result[0];
		$row->rev_user_text = $result[1];
	}

	/**
	 * Open a <page> element, emitting the REWRITTEN title.
	 * @return string XML fragment
	 */
	function openPage( $row ) {
		$out = "  <page>\n";
		// *** Begin of added rewriting rule
		$title = $this->getNewTitle($row);
		// *** End of added rewriting rule
		$out .= '    ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
		$out .= '    ' . wfElement( 'id', array(), strval( $row->page_id ) ) . "\n";
		if( '' != $row->page_restrictions ) {
			$out .= '    ' . wfElement( 'restrictions', array(),
				strval( $row->page_restrictions ) ) . "\n";
		}
		return $out;
	}

	/**
	 * Emit one <revision> element with rewritten contributor and text.
	 * @return string XML fragment
	 */
	function writeRevision( $row ) {
		$fname = 'WikiExporter::dumpRev';
		wfProfileIn( $fname );

		$out = "    <revision>\n";
		$out .= "      " . wfElement( 'id', null, strval( $row->rev_id ) ) . "\n";

		$ts = wfTimestamp( TS_ISO_8601, $row->rev_timestamp );
		$out .= "      " . wfElement( 'timestamp', null, $ts ) . "\n";

		if( $row->rev_deleted & Revision::DELETED_USER ) {
			$out .= "      " . wfElement( 'contributor', array( 'deleted' => 'deleted' ) ) . "\n";
		} else {
			$out .= "      <contributor>\n";
			// Perform the check here to see if we need to do any username
			// replacements; doing it here means IP replacements work as well.
			$this->getNewUser($row);
			if( $row->rev_user ) {
				$out .= "        " . wfElementClean( 'username', null, strval( $row->rev_user_text ) ) . "\n";
				$out .= "        " . wfElement( 'id', null, strval( $row->rev_user ) ) . "\n";
			} else {
				$out .= "        " . wfElementClean( 'ip', null, strval( $row->rev_user_text ) ) . "\n";
			}
			$out .= "      </contributor>\n";
		}

		if( $row->rev_minor_edit ) {
			$out .= "      <minor/>\n";
		}
		if( $row->rev_deleted & Revision::DELETED_COMMENT ) {
			$out .= "      " . wfElement( 'comment', array( 'deleted' => 'deleted' ) ) . "\n";
		} elseif( $row->rev_comment != '' ) {
			$out .= "      " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n";
		}

		if( $row->rev_deleted & Revision::DELETED_TEXT ) {
			$out .= "      " . wfElement( 'text', array( 'deleted' => 'deleted' ) ) . "\n";
		} elseif( isset( $row->old_text ) ) {
			// Raw text from the database may have invalid chars
			$text = strval( Revision::getRevisionText( $row ) );
			// *** Begin of added rewriting rule
			$text = $this->rewriter->rewriteText($row->page_namespace, $row->page_title,
				$this->getNewTitle($row), $text);
			// *** End of added rewriting rule
			$out .= "      " . wfElementClean( 'text',
				array( 'xml:space' => 'preserve' ),
				strval( $text ) ) . "\n";
		} else {
			// Stub output
			$out .= "      " . wfElement( 'text',
				array( 'id' => $row->rev_text_id ), "" ) . "\n";
		}

		$out .= "    </revision>\n";

		wfProfileOut( $fname );
		return $out;
	}
}

/**
 * BackupDumper subclass that installs the rewriting XML writer and restricts
 * the dump to the pages selected by the rewrite-rule config.
 */
class BackupDumperRewriter extends BackupDumper {
	var $rewriter = null;

	/**
	 * Extension point for extra command-line options. Loads the rewrite
	 * config (relative to the directory the script was started from) and
	 * builds the PageRewriteRules instance.
	 *
	 * NOTE(review): as in the original, this (re)loads the config on EVERY
	 * extra option, not just --config — presumably harmless, but confirm.
	 */
	function processOption( $opt, $val, $param ) {
		global $originalDir;
		$config_file = "dump.conf"; // default config file
		switch ($opt) {
		case "config":
			$config_file = $val;
			break;
		}
		// commandLine.inc chdir()s away; resolve the config path relative to
		// where the user actually invoked the script.
		$olddir = getcwd();
		chdir( $originalDir );
		$lines = file($config_file);
		chdir( $olddir );
		if ( $lines === false ) {
			print "Unable to open config file {$config_file}\n";
			exit;
		}
		$this->rewriter = new PageRewriteRules($lines);
	}

	/**
	 * Run the dump with rewriting enabled.
	 * @param int $history WikiExporter::FULL or WikiExporter::CURRENT
	 * @param int $text    WikiExporter::TEXT or WikiExporter::STUB
	 */
	function dump( $history, $text = MW_EXPORT_TEXT ) {
		# Notice messages will foul up your XML output even if they're
		# relatively harmless.
		ini_set( 'display_errors', false );

		// Fixed: if no extra option triggered processOption(), the rewriter
		// was never built and the calls below fataled on null. Fall back to
		// the default config file.
		if ( !$this->rewriter ) {
			$this->processOption( 'config', 'dump.conf', null );
		}

		$this->initProgress( $history );

		$db =& $this->backupDb();
		$exporter = new WikiExporter( $db, $history, WikiExporter::STREAM, $text );
		$exporter->writer = new XmlDumpRewriter($this->rewriter);

		$wrapper = new ExportProgressFilter( $this->sink, $this );
		$exporter->setOutputSink( $wrapper );

		if( !$this->skipHeader )
			$exporter->openStream();
		$exporter->dumpFrom($this->rewriter->pageFilter());
		if( !$this->skipFooter )
			$exporter->closeStream();

		$this->report( true );
	}
}

$dumper = new BackupDumperRewriter( $argv ); // *** only change from dumpBackup

if( isset( $options['quiet'] ) ) {
	$dumper->reporting = false;
}
$dumper->skipHeader = isset( $options['skip-header'] );
$dumper->skipFooter = isset( $options['skip-footer'] );

$textMode = isset( $options['stub'] ) ? WikiExporter::STUB : WikiExporter::TEXT;

if( isset( $options['full'] ) ) {
	$dumper->dump( WikiExporter::FULL, $textMode );
} elseif( isset( $options['current'] ) ) {
	$dumper->dump( WikiExporter::CURRENT, $textMode );
} else {
	// NOTE(review): heredoc and <...> placeholders reconstructed; the
	// originals were stripped during extraction.
	$dumper->progress( <<<END
Usage: php dumpBackup_rewrite.php <action> [<options>]
Actions:
  --full      Dump complete history of every page.
  --current   Includes only the latest revision of each page.

Options:
  --config=<file> Config file for rewriting (Default: dump.conf)
  --quiet     Don't dump status reports to stderr.
  --report=n  Report position and speed after every n pages processed.
              (Default: 100)
  --server=h  Force reading from MySQL server h
  --skip-header Don't output the <mediawiki> header
  --skip-footer Don't output the </mediawiki> footer
  --stub      Don't perform old_text lookups; for 2-pass dump

Fancy stuff:
  --plugin=<class>[:<file>]   Load a dump plugin class
  --output=<type>:<file>      Begin a filtered output stream;
                              <type>s: file, gzip, bzip2, 7zip
  --filter=<type>[:<options>] Add a filter on an output branch

END
	);
}