# Contacts a MediaWiki server and de-wikifies its pages, making them appear to be static pages
#
# Author: Austin Che
# See http://openwetware.org/wiki/User:Austin/Extensions/Dewikify
# 
# 2005/09/15: Initial version
# 2005/11/25: Dewikifying for multiple virtual hosts
# 2005/12/15: Page templates for categories
# 2006/01/13: add mediawiki extension to allow text to only show up either on wiki/non-wiki
# 2006/04/29: Change script to always use dewikify=yes instead of action=dewikify for the extension
# 2006/07/25: Change method of determining category from virtual server name
#
# To use this, put this library file in a location readable by the webserver
# It does not need to be under the document root for the webserver
# Create an index.cgi file to be executed by the webserver
# Load this library in the file and override whatever variables
# you wish to customize, e.g.:
# ---------
# #!/usr/bin/perl -Tw
# use strict;
# use lib "../lib";
# use Dewikify;
# $Dewikify::WIKI_CACHE_DIR = "/home/web/tmp/cache";
# $Dewikify::CACHE_LIFE = 1;
# %Dewikify::MAPPING = ( 
#                         "foo" => "Foo_Bar",
#                         "bar" => "Hello",
#                         );
# Dewikify::main();
# ---------
#
# If you use the associated dewikify.php mediawiki extension (not necessary),
# on your wiki, new tags will be defined to allow text to only show up on the wiki
# or only on the dewikified version.
# 
# You'll also need to add some mod_rewrite rules. Here is what I use:
#        RewriteEngine On
#        # first point the empty uri to the script
#        RewriteRule ^/$ /index.cgi [L]
#        # rewrite everything else ending in html to the script also
#        RewriteRule ^/(.*)\.html$ /index.cgi?page=$1 [L]
#        # this is needed for our edit trick where we add x to end of url to edit it
#        RewriteRule ^/(.*x)$ /index.cgi?page=$1 [L]

package Dewikify;

use strict;
use CGI qw/:standard/;
use CGI::Carp qw(fatalsToBrowser);

# ---------------------------------------------------------------------------
# Global defaults.  Each of these may be overridden by the index.cgi wrapper
# after "use Dewikify;" and before calling Dewikify::main() (see the header
# comment for an example).
# ---------------------------------------------------------------------------
our $WIKI_CACHE_DIR = "/tmp"; # should be writable for web server
our $CACHE_LIFE = 0;             # cache lifetime in days (compared with -M)
our $WIKI_SERVER = "http://openwetware.org"; # base location of wiki
our $DEWIKIFY_NAMESPACE = "Dewikify"; # stuff to control this program on the wiki
our $DEWIKIFY_DEFAULT_TEMPLATE = "Dewikify:Default"; # name of default template
our $TEMPLATE_BODY = "!BODY!";  # what is replaced by the page's body in templates
our $DEFAULT_TEMPLATE = "$TEMPLATE_BODY";  # if even DEWIKIFY_DEFAULT_TEMPLATE page doesn't exist
our $WIKI_PATH = "/wiki/"; # deal with aliased "clean" path
our %MAPPING = ( ); # mapping from server names to wiki categories

# full url for retrieving a page
# this script designed for default monobook
# dewikify=yes is for if the dewikify extension is installed
# should have no effect otherwise
our $WIKI_REAL_PATH = "/index.php?useskin=monobook&dewikify=yes&title="; 

# set this to something if it should be fixed
# otherwise it will be set depending on $SERVER
# (main() fills it in via get_category_from_server() when left undef)
our $WIKI_CATEGORY;

# these depend on the virtual host we're called by
# (server_name/url come from CGI.pm's :standard export)
our $SERVER = server_name();
our $MY_URL = url(-base=>1);
our $FULL_URL = url();

our $CURL_PROG = "/usr/bin/curl"; # path to curl

# untaint some environment variables
# (required under -T before we can open a pipe to curl)
$ENV{"CDPATH"} = "";
$ENV{"PATH"} = "";

# Send an HTTP redirect to the given URL and terminate the script;
# nothing after a call to this sub ever runs.
sub do_redirect($)
{
    my ($destination) = @_;
    print redirect($destination);
    exit;
}

# Purge the cache directory of (a) every file belonging to the current
# category, (b) every file in the Dewikify control namespace, and
# (c) any file older than the configured cache lifetime (in days).
# Dies if the cache directory cannot be opened.
sub clear_cache()
{
    opendir(my $dh, $WIKI_CACHE_DIR) || die "cannot open cache directory: $!";
    foreach my $entry (readdir($dh))
    {
        # Cache files only ever contain word chars and '%' (see
        # encode_safe_url).  Anchored match both untaints the name and
        # skips ".", "..", and anything else we didn't create.  (The old
        # unanchored m/([\w%]*)/ could match an empty string and turn
        # "." into a bogus path.)
        next unless ($entry =~ m/^([\w%]+)$/);
        my $file = $1;
        my $path = "$WIKI_CACHE_DIR/$file";
        # \Q...\E: the category is derived from the host name and may
        # contain regex metacharacters; treat it literally.
        unlink($path) if (($file =~ m/^\Q$WIKI_CATEGORY\E/) ||
                          ($file =~ m/^\Q$DEWIKIFY_NAMESPACE\E/) || ( -M $path > $CACHE_LIFE));
    }
    closedir($dh);
}

# Percent-encode every non-word character of the argument so the result
# is safe to use as a filename or inside a shell-quoted URL, and return
# the (untainted) encoded string.
sub encode_safe_url($)
{
    my ($text) = @_;
    # Every character outside [A-Za-z0-9_] becomes %XX.
    $text =~ s/(\W)/sprintf("%%%02X", ord($1))/seg;
    # (To reverse the encoding:
    #  s/\%([A-Fa-f0-9]{2})/pack('C', hex($1))/seg )
    # The encoded string now consists solely of word chars and '%', so
    # this match captures all of it -- and the capture untaints it.
    $text =~ /([\w%]*)/;
    return $1;
}

# Fetch one page from the wiki via curl, strip the MediaWiki chrome,
# rewrite links to point back at this site, and wrap the result in the
# page template.  Returns finished HTML, bare content for pages in the
# Dewikify namespace, or undef if the page does not exist on the wiki.
sub get_current_page($)
{
    my ($page) = @_;
    my $data;       # raw HTML as served by the wiki
    my $content;    # rendered article between the content markers
    my $header;     # everything up to and including </head>
    my $template;   # template HTML wrapped around $content

    # get current version of page from wiki, need to untaint $page
    # NOTE(review): $page is interpolated into a shell command line; this
    # is only safe because encode_safe_url() reduces it to [\w%] chars.
    open(PAGE, "$CURL_PROG -q -s -f \"${WIKI_SERVER}${WIKI_REAL_PATH}" . encode_safe_url($page) . "\" |") || return;
    $data = join("", <PAGE>);
    close(PAGE);

    # check if page doesn't exist (MediaWiki's "no text" stub page)
    return undef if ($data =~ m@<!-- start content -->.*There is currently no text in this page@s);

    # split into the <head> section and the rendered article body;
    # the HTML comments are markers emitted by the monobook skin
    $data =~ m@^(.*</head>).*<!-- start content -->(.*)<!-- end content -->@s || die "Unable to parse wiki page: $data";
    $header = $1;
    $content = $2;

    # make all related pages point back to us
    # make sure to detect internal links (# in the regexp)
    $content =~ s@href="${WIKI_PATH}$WIKI_CATEGORY:([^"#]*)@href="$MY_URL/$1.html@g;
    $content =~ s@href="${WIKI_PATH}$WIKI_CATEGORY(["#]?)@href="$MY_URL/$1@g;
    # same-page anchors need the full URL because of the <base> tag below
    $content =~ s@href=\"\#@href=\"$FULL_URL\#@g;

    # remove image links (keep the <img> itself, drop the link to the
    # Image: description page)
    $content =~ s@<a href=\"${WIKI_PATH}Image:.*?>(<img.*?>)</a>@$1@gs;

    # remove editing things
    #$content =~ s@<div class="editsection".*?</div>@@g; # section edits
    $content =~ s@<span class="editsection".*?</span>@@g; # section edits
    $content =~ s@<a href=\"/[^>]*?action=edit.*?>(.*?)</a>@$1@gs; # links to non-existent pages

    # footer not needed
    $content =~ s@(<!-- Saved in parser cache with key.*)?<div class="printfooter">.*?</div>@@s;

    # for things under dewikify namespace, don't add headers, just content
    # (the templates themselves are fetched through this same path)
    return $content if ( $page =~ m@^$DEWIKIFY_NAMESPACE@);

    # relative links/stylesheets should resolve against the wiki server
    $header =~ s@<head>@<head><base href="$WIKI_SERVER">@;

    # don't want the wikipedia favicon.ico
    $header =~ s@<link rel="shortcut icon" href="/favicon.ico" />@@s;

    # make a nice title from the page name
    $page =~ s@_@ @g;
    $page =~ s@:@: @g;
    $header =~ s@<title>.*</title>@<title>$page</title>@;

    # the default skin puts a background image which we don't want, so overwrite with white background
    $header =~ s@</head>@\n<style>body {background: white;}</style>\n</head>\n@s;

    # get the page template: category-specific first, then the wiki-wide
    # default, then the built-in fallback (content only, no wrapping)
    $template = get_page($DEWIKIFY_NAMESPACE . ":" . $WIKI_CATEGORY);
    $template = get_page($DEWIKIFY_DEFAULT_TEMPLATE) if (! defined($template));
    $template = $DEFAULT_TEMPLATE if (! $template);

    # a template without the !BODY! placeholder would drop the content
    $template = $DEFAULT_TEMPLATE if ($template !~ m/$TEMPLATE_BODY/);

    # fill template with our page's content
    $template =~ s@$TEMPLATE_BODY@$content@s;
    return "${header}\n<body><div id='globalWrapper'>\n${template}\n</div></body></html>";
}

# Fetch a fresh copy of $page from the wiki and write it to $cachefile.
# Returns the page data, or undef (without touching the cache) when the
# page does not exist or the fetch failed.  Dies on cache write errors.
sub update_cache($$)
{
    my ($cachefile, $page) = @_;

    my $data = get_current_page($page);
    return undef if (! $data);

    # Three-arg open with a lexical handle; also check close so buffered
    # write errors (e.g. disk full) are not silently discarded.
    open(my $fh, '>', $cachefile) || die "Cannot open cache file $cachefile: $!";
    print $fh $data;
    close($fh) || die "Cannot write cache file $cachefile: $!";
    return $data;
}

# Return the dewikified HTML for $page, refreshing the on-disk cache
# when it is missing or older than $CACHE_LIFE days.  Returns undef if
# the page does not exist on the wiki.
sub get_page($)
{
    my ($page) = @_;
    my $data;
    my $cachefile;

    # for cache file, want to make the page name safe for a filename
    # i.e. rewrite chars like "/" using standard url encoding
    # (this also prevents a page name from escaping the cache directory)
    $cachefile = "${WIKI_CACHE_DIR}/" . encode_safe_url($page);

    # check if cache file exists and is still fresh
    if (! -r $cachefile || -M $cachefile > $CACHE_LIFE)
    {
        # update cache
        $data = update_cache($cachefile, $page);
        return undef if (! defined($data)); # if page doesn't exist
    }

    if (! $data)
    {
        # read from cache -- three-arg open with a lexical handle
        open(my $fh, '<', $cachefile) || die "cannot open cache $cachefile: $!";
        $data = join("", <$fh>);
        close($fh);
    }

    return $data;
}

# Print the HTTP header followed by the finished page.
sub output_page($)
{
    my ($html) = @_;
    # The wiki serves UTF-8; CGI.pm's default ISO-8859-1 charset would
    # mangle it, so declare UTF-8 explicitly.
    print header(-charset=>'UTF-8'), $html;
}

# Edit shorthand: a trailing "x" on the URL redirects to the wiki's
# edit form for that page ("x" alone edits the category root page).
# On a match this clears the cache and never returns; otherwise no-op.
sub check_edit_redirect($)
{
    my ($page) = @_;

    my $target;
    if ($page eq "x")
    {
        $target = $WIKI_CATEGORY;
    }
    elsif ($page =~ /(.*)\.htmlx$/)
    {
        # the capture also untaints the page name
        $target = "$WIKI_CATEGORY:$1";
    }

    return unless ($target);

    # clear cache on an edit redirect so the next view shows the edit
    clear_cache();
    do_redirect("${WIKI_SERVER}${WIKI_REAL_PATH}$target&action=edit");
}

# Derive the wiki category for this virtual host.  The mapping table is
# probed with, in order: the full server name (www.foo.org), the last
# two labels (foo.org), and the first label (www).  If no mapping
# matches, the probed name itself (capitalized) becomes the category.
sub get_category_from_server()
{
    my $key = $SERVER;          # full server name
    unless ($MAPPING{$key})
    {
        # try the top domain, e.g. foo.org
        $key = $1 if ($SERVER =~ /([^.]*\.[^.]*)$/);
        unless ($MAPPING{$key})
        {
            # finally, just the first part, e.g. www
            $key = $1 if ($SERVER =~ /^([^.]*)/);
        }
    }

    my $category = $MAPPING{$key} ? $MAPPING{$key} : $key;
    # wiki titles are capitalized
    $category = ucfirst($category);

    return $category;
}

# Entry point: resolve the requested page from the CGI "page" parameter,
# fetch and dewikify it, and print it -- or redirect when it is missing
# or when the edit shorthand is used.
sub main()
{
    my $page = param('page') ? param('page') : "";
    my $content;

    # category may be preset by the index.cgi wrapper; otherwise derive
    # it from the virtual host name
    $WIKI_CATEGORY = get_category_from_server() if (! defined($WIKI_CATEGORY));

    # may redirect to the wiki's edit form and never return
    check_edit_redirect($page);
    
    # for mediawiki, valid characters are the ones between 32-126 inclusive
    # except for the following  # + < > [ ] | { }
    if ($page =~ /./)
    {
        # remove invalid characters and untaint page
        $page =~ s@[][\#+<>|{}]@@g; 
        $page =~ s@[^ -~]@@g;
        # prefix ":" so it can be appended directly to the category name
        $page = ($page =~ /./ ? ":$page" : "");
    }
    
    # make full wiki title with category
    $content = get_page("$WIKI_CATEGORY$page");

    # if $content is undef then page doesn't exist
    # if page was given in url, then we jump up to root
    # otherwise if root doesn't exist, jump to wiki home page
    do_redirect(param('page') ? $MY_URL : $WIKI_SERVER) if (! defined($content));

    output_page($content);
}

1;
