wikiproxy.php v0.5

October 2004 · Stefan Magdalinski · blog post · modern version

<?php
//phpinfo();

/*

  wikiproxy.php v0.5 04-10-2004

  stefan (wholivesat) whitelabel.org

  Wikipediarises http://news.bbc.co.uk, and adds a technorati.com sidebar
  to any pages that have been linked from blogs.

  This requires a database of wikipedia titles, like all_titles_in_ns0,
  imported into a mysql database 'wikiproxy' created like this:

  mysql> create table titles (title varchar(190) NOT NULL,
         PRIMARY KEY (title)) type=MyISAM;

  followed by

  mysql> load data infile '/home/stefan/whitelabel.org/wp/all_titles_in_ns0'
         ignore into table titles;

  The bits of code I didn't borrow from elsewhere (and I've credited where
  I did) are licensed under the GPL. Do with them what you will, but this is
  my first PHP and my first code for 7 years, so I'd appreciate feedback and
  suggestions via comments on my blog:

  http://www.whitelabel.org/archives/002248.html

*/

// Point PHP at the locally installed PEAR tree so the two require_once
// calls below can find PEAR.php and HTTP/Request.php.
set_include_path('/data/vhost/wikiproxy.whitelabel.org/docs/pear/include');
require_once "PEAR.php";
require_once "HTTP/Request.php";

// MySQL connection settings; wikipedize() imports these with `global`.
$DATABASE = "wp";
$HOST     = "pear";
$USER     = "stefan";
$PASSWORD = "XXXX";

/**
 * One Technorati result: a weblog that links to the current article.
 * Instances are created and filled in by character1Data() and rendered
 * into the "BLOGS ABOUT THIS ARTICLE" box by the main script.
 */
class xItem {
    // Text of the WEBLOG/NAME element; used as the link text.
    var $xTitle;
    // Text of the WEBLOG/URL element; link target when no usable permalink exists.
    var $xLink;
    // Text of the NEARESTPERMALINK element; preferred link target.
    var $xPermalink;
}

// Technorati results, filled in by the XML callbacks that doCosmos()
// registers. $itemCount is the index in $arItems of the entry currently
// being built by those callbacks.
$arItems   = array();
$itemCount = 0;

// Decide which page to fetch. Only news.bbc.co.uk itself, or a path beneath
// it, is accepted; anything else (or no url at all) falls back to the front
// page so the script cannot be used to proxy arbitrary sites.
$bbc_root   = 'http://news.bbc.co.uk';
$url_string = $bbc_root;
if (isset($_GET['url']) && is_string($_GET['url'])) {
    $on_bbc = ($_GET['url'] == $bbc_root)
        || strncmp($_GET['url'], $bbc_root . '/', strlen($bbc_root) + 1) == 0;
    if ($on_bbc) {
        $url_string = $_GET['url'];
    }
}

// Plain assignment: "=& new" is deprecated in PHP 5 and a parse error in PHP 7.
$req = new HTTP_Request($url_string);

if (!PEAR::isError($req->sendRequest())) {
    $result = $req->getResponseBody();

    // Fix up the stylesheet ref, and wikiproxify the urls
    $result = str_replace(
        '<link type="text/css" rel="stylesheet" href="',
        '<link type="text/css" rel="stylesheet" href="http://news.bbc.co.uk',
        $result
    );
    $result = str_replace(
        '<a href="/',
        '<a href="/index.php?url=http://news.bbc.co.uk/',
        $result
    );
    $result = str_replace(
        '<body marginwidth="0" topmargin="0" leftmargin="0" marginheight="0">',
        '<body marginwidth="0" topmargin="0" leftmargin="0" marginheight="0">'
            . 'This is <em>not</em> News Online. For details see '
            . '<a href="http://www.whitelabel.org/archives/002248.html">this page</a>. '
            . 'Supported by backstage.bbc.co.uk'
            . '<!-- although when I say supported, I obviously mean '
            . '"well, they just have failed to actually sue me yet" -stef -->',
        $result
    );

    // Only process pages that are actually articles (7-digit article ID).
    // Test the validated URL, not the raw request parameter.
    if (preg_match("/\d{7}/", $url_string)) {
        // Wikipedize just the article body: everything from the first
        // "E IIMA" marker to the last "E BO" marker. The linked text is put
        // back with str_replace() rather than preg_replace() so that "$20"
        // or "\1" in the article is not read as a regex backreference.
        if (preg_match("/<!-- E IIMA -->.*<!-- E BO -->/s", $result, $body)) {
            $result = str_replace($body[0], wikipedize($body[0]), $result);
        }

        // Get the technorati links (fills $arItems)
        doCosmos($url_string);

        // Stick them in the right hand nav, if there are any
        if (count($arItems) > 0) {
            // Build a block
            $techblock = '<div class="puffbox"><div class="o">'
                . '<div class="mph1">BLOGS ABOUT THIS ARTICLE</div>'
                . '<img height="5" hspace="0" vspace="0" border="0" width="1" '
                . 'alt="" src="http://newsimg.bbc.co.uk/shared/img/o.gif" /><br />';

            $techblocktail = '(links from <a href="http://www.technorati.com">'
                . 'technorati.com</a>)</div></div>';

            // Show each weblog once: keep the first entry for a given weblog
            // URL and skip later repeats.
            $seenLinks = array();
            foreach ($arItems as $txItem) {
                if (in_array($txItem->xLink, $seenLinks, true)) {
                    continue;
                }
                $seenLinks[] = $txItem->xLink;

                // Prefer the permalink to the post; fall back to the weblog URL.
                if (strlen((string) $txItem->xPermalink) > 5) {
                    $href = $txItem->xPermalink;
                } else {
                    $href = $txItem->xLink;
                }

                // Titles and URLs come from a third party, so escape them.
                $techblock .= '<div class="arr"><a href="'
                    . htmlspecialchars((string) $href) . '">'
                    . htmlspecialchars((string) $txItem->xTitle) . '</a><br/></div>';
            }
            $techblock .= $techblocktail;

            // On BBC News there's no clear hook for the RHN, so use three.
            // The greedy (.*) places the block before the last hook found.
            // "\" and "$" in the block are escaped so preg_replace() treats
            // them literally instead of as backreferences.
            $result = preg_replace(
                "/(.*)(<div class=\"sah\">|<div class=\"puffbox\">|<div class=\"av1h\")/ms",
                '$1' . addcslashes($techblock, '\\$') . '$2',
                $result
            );
        }
    }

    // Serve back the final page
    echo $result;
} else {
    // Say so rather than serving a blank page when the fetch fails.
    header('HTTP/1.0 502 Bad Gateway');
    echo 'Sorry, the page could not be fetched from news.bbc.co.uk.';
}


// ── Functions ────────────────────────────────────────────────────────────

/**
 * Scan text for capitalised phrases and acronyms, match them against a
 * MySQL table of Wikipedia titles, and wrap first occurrences in links.
 *
 * Candidates are tried longest first; a candidate is skipped when it is
 * contained in a phrase that has already been linked. Connection settings
 * come from the globals $HOST, $USER, $PASSWORD and $DATABASE. The script
 * is terminated with die() if the database cannot be reached or queried.
 *
 * @param string $source HTML fragment to annotate
 * @return string the fragment with Wikipedia links added
 */
function wikipedize($source)
{
    global $DATABASE, $HOST, $USER, $PASSWORD;

    $temp1 = $source;

    // Set up regex building blocks
    $capsword    = "[A-Z][a-zA-Z'0-9]*";
    $fillerwords = "of|and|in|on|under|the";
    $middlewordre = "(?:$capsword|$fillerwords)\s*";
    $endwordre   = "(?:$capsword)\s*";

    // Match "Two Endwords" or "Endword and Some Middle Words"
    $greedyproperre = "/\b($endwordre(?:$middlewordre)*$endwordre)\b/ms";

    // Match without filler words — so "Amnesty International and Human
    // Rights Watch" also yields "Amnesty International" + "Human Rights Watch"
    $frugalproperre = "/\b((?:$endwordre){2,})\b/ms";

    preg_match_all($greedyproperre, $temp1, $propernounphrases1);
    preg_match_all($frugalproperre, $temp1, $propernounphrases2);

    // Acronyms: runs of two or more capital letters
    preg_match_all("/\b([A-Z]{2,})/ms", $temp1, $acronyms);

    // Trim first so "Tony Blair" and "Tony Blair " collapse into a single
    // candidate (one query instead of two), then sort longest first.
    $phrases = array_unique(array_map('trim', array_merge(
        $propernounphrases1[0],
        $propernounphrases2[0],
        $acronyms[0]
    )));
    usort($phrases, "lensort");

    // Open DB connection and match candidates against Wikipedia titles
    if (!($connection = @mysql_connect($HOST, $USER, $PASSWORD))) {
        die("could not connect");
    }
    if (!(@mysql_select_db($DATABASE, $connection))) {
        die("no database found");
    }

    $matched = array();

    foreach ($phrases as $phrase) {
        $wikistring = str_replace(" ", "_", $phrase);

        $match = mysql_query(
            "SELECT title FROM titles WHERE title = '"
                . mysql_real_escape_string($wikistring, $connection) . "'",
            $connection
        );
        if (!$match) {
            die("something went wrong");
        }

        // The SQL comparison may ignore case, so insist on an exact match.
        $exists = false;
        while ($row = mysql_fetch_array($match, MYSQL_NUM)) {
            if ($row[0] === $wikistring) {
                $exists = true;
            }
        }
        mysql_free_result($match);

        if (!$exists) {
            continue;
        }

        // Skip if already matched a string this one is contained within
        $use = true;
        foreach ($matched as $got) {
            if (strpos($got, $phrase) !== false) {
                $use = false;
                break;
            }
        }

        // Replace first occurrence only. The leading \b stops a short
        // acronym matching in the middle of a longer word, and the
        // lookahead skips occurrences that sit inside an HTML tag (for
        // example an alt or href attribute), where a link would break
        // the markup.
        if ($use) {
            $temp1 = preg_replace(
                "/\b" . preg_quote($phrase, "/") . "(?![^<>]*>)/",
                "<a style=\"text-decoration:underline\" "
                    . "href=\"http://en.wikipedia.org/wiki/{$wikistring}\">"
                    . "{$phrase}</a>",
                $temp1,
                1
            );
            array_push($matched, $phrase);
        }
    }

    mysql_close($connection);

    return $temp1;
}


/**
 * Query the Technorati Cosmos API for blog posts linking to this article.
 * Results are cached for 30 minutes per article ID in cache/<id>.xml.
 *
 * The XML is parsed with start1Element(), end1Element() and
 * character1Data(), which add the results to the global $arItems.
 * Nothing is returned. If the URL contains no 7-digit article ID, or the
 * API cannot be reached and nothing is cached, the function returns
 * without adding any results.
 *
 * (Technorati key: 7e64960cc7e9b1cb4315e56a6544fce7)
 *
 * @param string $urlstring URL of the article to look up
 */
function doCosmos($urlstring)
{
    // The cache file is named after the article ID; no ID, nothing to do.
    if (!preg_match("/\d{7}/", $urlstring, $nums)) {
        return;
    }
    $cacheFile = "cache/{$nums[0]}.xml";

    // The article URL is itself a query-string value, so it must be encoded.
    $techRati = "http://api.technorati.com/cosmos?format=xml"
        . "&url=" . urlencode($urlstring)
        . "&key=7e64960cc7e9b1cb4315e56a6544fce7";

    $contents = false;

    // Fetch if cache is missing, stale (> 30 min), or empty
    if (!file_exists($cacheFile)
        || (time() - filemtime($cacheFile) > 1800)
        || filesize($cacheFile) == 0
    ) {
        $lines = file($techRati);
        set_time_limit(10);

        // Only touch the cache when the fetch worked, so a failed request
        // does not wipe out a usable (if stale) copy.
        if ($lines !== false) {
            $contents = implode('', $lines);
            $cachefp  = fopen($cacheFile, "w");
            if ($cachefp) {
                fwrite($cachefp, $contents);
                fclose($cachefp);
            }
        }
    }

    // No fresh response: fall back to whatever is cached, if anything.
    if ($contents === false) {
        if (!file_exists($cacheFile)) {
            return;
        }
        $contents = file_get_contents($cacheFile);
        if ($contents === false) {
            die("Error reading XML data.");
        }
    }

    $xml_parser = xml_parser_create();
    xml_set_element_handler($xml_parser, "start1Element", "end1Element");
    xml_set_character_data_handler($xml_parser, "character1Data");

    // The whole document is in memory, so parse it in one final call.
    xml_parse($xml_parser, $contents, true);
    xml_parser_free($xml_parser);
}


// ── XML parser callbacks for Technorati API ──────────────────────────────

/**
 * Element-open callback: append the tag name to the '^'-separated path of
 * open elements kept in the global $curTag (e.g. "^TAPI^DOCUMENT^ITEM").
 */
function start1Element($parser, $tagName, $attrs)
{
    global $curTag;
    $curTag .= "^$tagName";
}

/**
 * Element-close callback: remove the last tag name from $curTag.
 *
 * Closing an ITEM element completes the result being built, so $itemCount
 * moves on to the next slot. Doing it here, rather than when permalink text
 * arrives, keeps items that have no permalink and is unaffected by text
 * being delivered in several pieces.
 */
function end1Element($parser, $tagName)
{
    global $curTag, $itemCount, $arItems;

    if ($curTag == "^TAPI^DOCUMENT^ITEM" && isset($arItems[$itemCount])) {
        $itemCount++;
    }

    $caret_pos = strrpos((string) $curTag, '^');
    if ($caret_pos === false) {
        // Nothing left to pop.
        $curTag = '';
        return;
    }
    $curTag = substr($curTag, 0, $caret_pos);
}

/**
 * Character-data callback: collect the weblog name, weblog URL and nearest
 * permalink of the current item into $arItems[$itemCount].
 *
 * The XML parser may deliver the text of one element in several calls
 * (typically around entities such as &amp;), so text is appended to the
 * field rather than assigned, and the xItem is created on first use.
 */
function character1Data($parser, $data)
{
    global $itemCount, $curTag, $arItems;

    $titleKey     = "^TAPI^DOCUMENT^ITEM^WEBLOG^NAME";
    $permalinkKey = "^TAPI^DOCUMENT^ITEM^NEARESTPERMALINK";
    $linkKey      = "^TAPI^DOCUMENT^ITEM^WEBLOG^URL";

    if ($curTag == $titleKey) {
        $field = 'xTitle';
    } elseif ($curTag == $permalinkKey) {
        $field = 'xPermalink';
    } elseif ($curTag == $linkKey) {
        $field = 'xLink';
    } else {
        // Text of an element we do not use (or whitespace between elements).
        return;
    }

    if (!isset($arItems[$itemCount])) {
        $arItems[$itemCount] = new xItem();
    }
    $arItems[$itemCount]->$field .= $data;
}


/**
 * Strip tags nested inside other tags, e.g. turn
 * <img alt="<b>x</b>" src="a"> into <img alt="x" src="a">.
 * Credit: Isaac Schlueter (from http://uk2.php.net/strip-tags)
 *
 * Each top-level tag is located by counting '<' and '>' characters; its
 * inside is passed through format_to_output(), which falls back to
 * strip_tags() when no such function is already defined.
 *
 * @param string $content HTML to clean
 * @param string $format  passed through to format_to_output()
 * @return string the cleaned HTML; returned unchanged if a tag is never
 *                closed with '>'
 */
function antiTagInTag($content = '', $format = 'htmlhead')
{
    if (!function_exists('format_to_output')) {
        function format_to_output($content, $format)
        {
            return strip_tags($content);
        }
    }

    $length  = strlen($content);
    // Initialised up front so content without any tags reaches
    // str_replace() with empty arrays rather than undefined variables.
    $tags    = array();
    $newtags = array();
    $tagend  = -1;

    for (
        $tagstart = strpos($content, '<', $tagend + 1);
        $tagstart !== false && $tagstart < $length;
        $tagstart = strpos($content, '<', $tagend)
    ) {
        // A '<' that is the very last character cannot open a tag. Stop
        // here: otherwise the next search would start on this same '<'
        // and the loop would never end.
        if ($tagstart + 1 >= $length) {
            break;
        }

        $walker = $tagstart + 1;
        $open   = 1;

        // Walk forward, counting nested '<' and '>' until the tag that
        // started at $tagstart is balanced.
        while ($open != 0 && $walker < $length) {
            $nextopen  = strpos($content, '<', $walker);
            $nextclose = strpos($content, '>', $walker);

            if ($nextclose === false) {
                // Unterminated tag: give up and leave the content alone.
                return $content;
            }

            if ($nextopen === false || $nextopen > $nextclose) {
                $open--;
                $walker = $nextclose + 1;
            } elseif ($nextopen < $nextclose) {
                $open++;
                $walker = $nextopen + 1;
            }
        }

        // $walker is one past the closing '>' (it can never exceed the
        // string length), so the inside of the tag lies strictly between
        // $tagstart and $tagend.
        $tagend = $walker - 1;

        $tag       = substr($content, $tagstart + 1, $tagend - $tagstart - 1);
        $tags[]    = '<' . $tag . '>';
        $newtag    = format_to_output($tag, $format);
        $newtags[] = '<' . $newtag . '>';
    }

    $content = str_replace($tags, $newtags, $content);
    return $content;
}

/**
 * usort() comparator that orders strings by length, longest first.
 *
 * @return int positive when $b is longer than $a, negative when shorter,
 *             zero when both have the same length
 */
function lensort($a, $b)
{
    $lengthOfA = strlen($a);
    $lengthOfB = strlen($b);

    return $lengthOfB - $lengthOfA;
}

?>