October 2004 · Stefan Magdalinski · blog post · modern version
<?php
//phpinfo();
/*
wikiproxy.php v0.5 04-10-2004
stefan (wholivesat) whitelabel.org
Wikipedia-ises http://news.bbc.co.uk (links proper nouns and acronyms to
Wikipedia), and adds a technorati.com sidebar
to any pages that have been linked from blogs.
This requires a database of wikipedia titles, like all_titles_in_ns0,
imported into a mysql database 'wikiproxy' created like this:
mysql> create table titles (title varchar(190) NOT NULL,
PRIMARY KEY (title)) type=MyISAM;
followed by
mysql> load data infile '/home/stefan/whitelabel.org/wp/all_titles_in_ns0'
ignore into table titles;
The bits of code I didn't borrow from elsewhere (and I've credited where I did)
are licensed under the GPL. Do with it what you will, but this is my first
php and my first code for 7 years, so I'd appreciate feedback and
suggestions via comments on my blog:
http://www.whitelabel.org/archives/002248.html
*/
// PEAR lives in a vhost-local directory; HTTP_Request comes from PEAR.
set_include_path('/data/vhost/wikiproxy.whitelabel.org/docs/pear/include');
require_once "PEAR.php";
require_once "HTTP/Request.php";
// MySQL settings for the Wikipedia titles table used by wikipedize().
// NOTE(review): the header comment says database 'wikiproxy' but the value
// here is "wp" (and wikipedize() selects "wp" directly) — confirm the name.
$DATABASE = "wp";
$HOST = "pear";
$USER = "stefan";
// Placeholder credential — the real password must not be committed.
$PASSWORD = "XXXX";
// One Technorati "cosmos" result: a blog post linking to the current article.
// Instances are created and populated by the XML callbacks (character1Data).
class xItem {
var $xTitle;     // weblog name (from WEBLOG/NAME)
var $xLink;      // weblog front-page URL (from WEBLOG/URL)
var $xPermalink; // permalink of the individual post (from NEARESTPERMALINK)
}
// Array of technorati links, filled in by doCosmos() via the XML callbacks.
$arItems = array();
$itemCount = 0;
// Decide which page to grab. Only news.bbc.co.uk URLs are proxied; anything
// else (or no ?url= at all) falls back to the front page.
// FIX: the original test, substr($_GET['url'], 13) != "http://news.bbc.co.uk",
// compared the URL's *tail from offset 13* against the full prefix string, so
// it accepted almost any URL and turned the script into an open proxy.
if (isset($_GET['url'])
    && substr($_GET['url'], 0, 21) == "http://news.bbc.co.uk"
) {
    $url_string = $_GET['url'];
} else {
    $url_string = 'http://news.bbc.co.uk';
}
// Fetch the target page. FIX: "=& new" (PHP 4 reference assignment) is
// unnecessary since PHP 5 and a parse error on PHP 7+.
$req = new HTTP_Request($url_string);
if (!PEAR::isError($req->sendRequest())) {
    $result = $req->getResponseBody();
    // Fix up the stylesheet ref, and wikiproxify the urls
    $result = str_replace(
        '<link type="text/css" rel="stylesheet" href="',
        '<link type="text/css" rel="stylesheet" href="http://news.bbc.co.uk',
        $result
    );
    $result = str_replace(
        '<a href="/',
        '<a href="/index.php?url=http://news.bbc.co.uk/',
        $result
    );
    $result = str_replace(
        '<body marginwidth="0" topmargin="0" leftmargin="0" marginheight="0">',
        '<body marginwidth="0" topmargin="0" leftmargin="0" marginheight="0">'
        . 'This is <em>not</em> News Online. For details see '
        . '<a href="http://www.whitelabel.org/archives/002248.html">this page</a>. '
        . 'Supported by backstage.bbc.co.uk'
        . '<!-- although when I say supported, I obviously mean '
        . '"well, they just have failed to actually sue me yet" -stef -->',
        $result
    );
    // Only process pages that are actually articles (7-digit article ID).
    // FIX: use $url_string rather than $_GET['url'], which is an undefined
    // index when the visitor hits the front page without a ?url= parameter.
    if (preg_match("/\d{7}/", $url_string)) {
        // Cut out the article body (between the E IIMA / E BO markers),
        // wikipedize it, then splice it back into the full page.
        $temp = $result;
        $temp1 = preg_replace(
            "/(.*?(<!-- E IIMA -->.*<!-- E BO -->).*)/ms",
            "$2",
            $temp
        );
        $temp1 = wikipedize($temp1);
        // Plug it back in.
        // NOTE(review): $temp1 is used as a preg_replace replacement string,
        // so literal "$1"/backslash sequences in the article could be
        // misinterpreted as backreferences — pre-existing behaviour, verify.
        $result = preg_replace(
            "/<!-- E IIMA -->(.*)<!-- E BO -->/ms",
            $temp1,
            $result
        );
        // Get the technorati links (same FIX as above: $_GET['url'] → $url_string)
        doCosmos($url_string);
        // Stick them in the right hand nav, but only when there are at least
        // two items. FIX: the original test `!count($arItems) < 2` parsed as
        // `(!count($arItems)) < 2`, which is always true.
        if (count($arItems) >= 2) {
            // Build a block
            $techblock = '<div class="puffbox"><div class="o">'
                . '<div class="mph1">BLOGS ABOUT THIS ARTICLE</div>'
                . '<img height="5" hspace="0" vspace="0" border="0" width="1" '
                . 'alt="" src="http://newsimg.bbc.co.uk/shared/img/o.gif" /><br />';
            $techblocktail = '(links from <a href="http://www.technorati.com">'
                . 'technorati.com</a>)</div></div>';
            for ($i = 0; $i < count($arItems); $i++) {
                $test = 1;
                $txItem = $arItems[$i];
                // Test for dupes.
                // NOTE(review): this drops *every* copy of a duplicated link,
                // including the first — pre-existing behaviour, confirm intent.
                for ($j = 0; $j < count($arItems); $j++) {
                    $txItemTemp = $arItems[$j];
                    if ($i != $j && $txItem->xLink == $txItemTemp->xLink) {
                        $test = 0;
                    }
                }
                if ($test == 1) {
                    // Prefer the post permalink; fall back to the blog URL.
                    if (strlen($txItem->xPermalink) > 5) {
                        $techblock .= '<div class="arr"><a href="'
                            . $txItem->xPermalink . '">'
                            . $txItem->xTitle . '</a><br/></div>';
                    } else {
                        $techblock .= '<div class="arr"><a href="'
                            . $txItem->xLink . '">'
                            . $txItem->xTitle . '</a><br/></div>';
                    }
                }
            }
            $techblock .= $techblocktail;
            // On BBC News there's no clear hook for the RHN, so use three
            $result = preg_replace(
                "/(.*)(<div class=\"sah\">|<div class=\"puffbox\">|<div class=\"av1h\")/ms",
                "$1{$techblock}$2",
                $result
            );
        }
    }
    // Serve back the final page
    echo $result;
}
// ── Functions ────────────────────────────────────────────────────────────
/**
* Scan text for capitalised phrases and acronyms, match them against a
* MySQL table of Wikipedia titles, and wrap first occurrences in links.
*/
/**
 * Scan text for capitalised phrases and acronyms, match them against a
 * MySQL table of Wikipedia titles, and wrap the first occurrence of each
 * match in a link to the corresponding Wikipedia article.
 *
 * @param string $source  HTML/text fragment to annotate
 * @return string         the fragment with Wikipedia links inserted
 */
function wikipedize($source)
{
    $temp1 = $source;
    // Set up regex building blocks
    $capsword = "[A-Z][a-zA-Z'0-9]*";
    $fillerwords = "of|and|in|on|under|the";
    $middlewordre = "(?:$capsword|$fillerwords)\s*";
    $endwordre = "(?:$capsword)\s*";
    // Match "Two Endwords" or "Endword and Some Middle Words"
    $greedyproperre = "/\b($endwordre(?:$middlewordre)*$endwordre)\b/ms";
    // Match without filler words — so "Amnesty International and Human
    // Rights Watch" also yields "Amnesty International" + "Human Rights Watch"
    $frugalproperre = "/\b((?:$endwordre){2,})\b/ms";
    preg_match_all($greedyproperre, $temp1, $propernounphrases1);
    preg_match_all($frugalproperre, $temp1, $propernounphrases2);
    // Acronyms: runs of two or more capital letters
    preg_match_all("/\b([A-Z]{2,})/ms", $temp1, $acronyms);
    // Deduplicate, sort longest first so longer phrases win over substrings
    $phrases = array_unique(array_merge(
        $propernounphrases1[0],
        $propernounphrases2[0],
        $acronyms[0]
    ));
    usort($phrases, "lensort");
    // Open DB connection and match candidates against Wikipedia titles
    $matched = array();
    global $DATABASE, $HOST, $USER, $PASSWORD;
    if (!($connection = @mysql_connect($HOST, $USER, $PASSWORD))) {
        die("could not connect");
    }
    // FIX: honour the $DATABASE config global instead of hard-coding "wp"
    if (!(@mysql_select_db($DATABASE, $connection))) {
        die("no database found");
    }
    foreach ($phrases as $phrase) {
        $phrase = trim($phrase);
        // FIX: ereg_replace() was removed in PHP 7; str_replace is equivalent
        // here (plain string substitution, no pattern needed).
        $wikistring = str_replace(" ", "_", $phrase);
        // FIX: mysql_escape_string() is deprecated; use the connection-aware
        // mysql_real_escape_string() instead.
        $sql = "SELECT * FROM titles WHERE title = '"
            . mysql_real_escape_string($wikistring, $connection) . "'";
        if (!($match = mysql_query($sql, $connection))) {
            die("something went wrong");
        }
        while ($row = @mysql_fetch_array($match, MYSQL_NUM)) {
            $found = $row[0];
            if ($found == $wikistring) {
                // Skip if already matched a string this one is contained within
                $use = true;
                foreach ($matched as $got) {
                    if (strstr($got, trim($phrase))) {
                        $use = false;
                    }
                }
                // Replace first occurrence only.
                // FIX: preg_quote() the phrase so apostrophes/metacharacters
                // in a title can never corrupt the pattern.
                if ($use) {
                    $temp1 = preg_replace(
                        "/" . preg_quote($phrase, "/") . "/",
                        "<a style=\"text-decoration:underline\" "
                        . "href=\"http://en.wikipedia.org/wiki/{$wikistring}\">"
                        . "{$phrase}</a>",
                        $temp1,
                        1
                    );
                    array_push($matched, $phrase);
                }
            }
        }
    }
    return $temp1;
}
/**
* Query the Technorati Cosmos API for blog posts linking to this article.
* Results are cached for 30 minutes per article ID.
*
* (Technorati key: 7e64960cc7e9b1cb4315e56a6544fce7)
*/
/**
 * Query the Technorati Cosmos API for blog posts linking to this article.
 * Results are cached for 30 minutes per 7-digit article ID; parsed results
 * land in the global $arItems via the XML callbacks below.
 *
 * @param string $urlstring  the article URL being proxied
 */
function doCosmos($urlstring)
{
    // FIX: urlencode() the article URL so its '://' and '/' characters do not
    // corrupt the API query string.
    $techRati = "http://api.technorati.com/cosmos?format=xml"
        . "&url=" . urlencode($urlstring)
        . "&key=7e64960cc7e9b1cb4315e56a6544fce7";
    // FIX: bail out when there is no 7-digit article ID — the original read
    // $nums[0] unconditionally (undefined-index notice) and then used a
    // bogus "cache/.xml" cache file.
    if (!preg_match("/\d{7}/", $urlstring, $nums)) {
        return;
    }
    $cacheFilename = $nums[0];
    $cachePath = "cache/{$cacheFilename}.xml";
    // Fetch if cache is missing, stale (> 30 min), or empty
    if (!file_exists($cachePath)
        || (time() - filemtime($cachePath) > 1800)
        || filesize($cachePath) == 0
    ) {
        // FIX: bound the slow remote fetch (set_time_limit used to run only
        // *after* the fetch had already completed).
        set_time_limit(10);
        $a = @file($techRati);
        // FIX: only (re)write the cache if the fetch actually succeeded —
        // file() returns false on failure and implode() would have choked.
        if ($a !== false) {
            $contents = implode('', $a);
            $cachefp = fopen($cachePath, "w");
            if ($cachefp) {
                fwrite($cachefp, $contents);
                fclose($cachefp);
            }
        }
    }
    $xml_parser = xml_parser_create();
    xml_set_element_handler($xml_parser, "start1Element", "end1Element");
    xml_set_character_data_handler($xml_parser, "character1Data");
    $fp = fopen($cachePath, "r")
        or die("Error reading XML data.");
    while ($data = fread($fp, 16384)) {
        xml_parse($xml_parser, $data, feof($fp));
    }
    fclose($fp);
    xml_parser_free($xml_parser);
}
// ── XML parser callbacks for Technorati API ──────────────────────────────
/**
 * expat start-element callback: push "^TAGNAME" onto the global
 * caret-delimited element path, producing paths like "^TAPI^DOCUMENT^ITEM".
 */
function start1Element($parser, $tagName, $attrs)
{
    global $curTag;
    $curTag = $curTag . '^' . $tagName;
}
/**
 * expat end-element callback: pop the last "^segment" off the global
 * element path maintained by start1Element().
 */
function end1Element($parser, $tagName)
{
    global $curTag;
    $lastCaret = strrpos($curTag, '^');
    // When no caret remains, strrpos() returns false and the original's
    // substr($curTag, 0, false) yielded '' — keep that behaviour explicitly.
    $curTag = ($lastCaret === false) ? '' : substr($curTag, 0, $lastCaret);
}
/**
 * expat character-data callback: route text into the current xItem based on
 * the caret-delimited element path maintained by start1Element/end1Element.
 *
 * FIX: expat may deliver one text node in several chunks. The original
 * assigned with '=' each time, so a later chunk overwrote earlier ones, a
 * second title chunk clobbered the freshly built item with a new xItem(),
 * and a split permalink incremented $itemCount once per chunk. Values are
 * now appended and the item/counter transitions are guarded.
 */
function character1Data($parser, $data)
{
    global $itemCount, $curTag, $arItems;
    $titleKey = "^TAPI^DOCUMENT^ITEM^WEBLOG^NAME";
    $permalinkKey = "^TAPI^DOCUMENT^ITEM^NEARESTPERMALINK";
    $linkKey = "^TAPI^DOCUMENT^ITEM^WEBLOG^URL";
    if ($curTag == $titleKey) {
        if (!isset($arItems[$itemCount])) {
            // First chunk of a new item's title: start a fresh record.
            $arItems[$itemCount] = new xItem();
            $arItems[$itemCount]->xTitle = $data;
        } else {
            // Continuation chunk of the same title.
            $arItems[$itemCount]->xTitle .= $data;
        }
    } elseif ($curTag == $permalinkKey) {
        if (isset($arItems[$itemCount])) {
            // The permalink closes out the current item.
            $arItems[$itemCount]->xPermalink = $data;
            $itemCount++;
        } else {
            // Continuation chunk arriving after the item was already closed.
            $arItems[$itemCount - 1]->xPermalink .= $data;
        }
    } elseif ($curTag == $linkKey) {
        if (isset($arItems[$itemCount]) && isset($arItems[$itemCount]->xLink)) {
            $arItems[$itemCount]->xLink .= $data;
        } else {
            $arItems[$itemCount]->xLink = $data;
        }
    }
}
/**
* Strip tags nested inside other tags.
* Credit: Isaac Schlueter (from http://uk2.php.net/strip-tags)
*/
/**
 * Strip tags nested inside other tags, e.g. '<a href="<b>x</b>">' becomes
 * '<a href="x">'. Outer tags are located by balancing '<'/'>' pairs, then
 * their contents are run through format_to_output()/strip_tags().
 * Credit: Isaac Schlueter (from http://uk2.php.net/strip-tags)
 *
 * @param string $content  markup to clean
 * @param string $format   format name forwarded to format_to_output()
 * @return string          cleaned markup (unchanged if markup is unbalanced)
 */
function antiTagInTag($content = '', $format = 'htmlhead')
{
    // Fallback shim for when the host blog engine's formatter is absent.
    if (!function_exists('format_to_output')) {
        function format_to_output($content, $format)
        {
            return strip_tags($content);
        }
    }
    // FIX: these were undefined when $content contained no tags, so the final
    // str_replace() received null search/replace arguments.
    $tags = array();
    $newtags = array();
    $tagend = -1;
    for (
        $tagstart = strpos($content, '<', $tagend + 1);
        $tagstart !== false && $tagstart < strlen($content);
        $tagstart = strpos($content, '<', $tagend)
    ) {
        // Walk forward, counting nested '<'/'>' until the outer tag balances.
        $walker = $tagstart + 1;
        $open = 1;
        while ($open != 0 && $walker < strlen($content)) {
            $nextopen = strpos($content, '<', $walker);
            $nextclose = strpos($content, '>', $walker);
            if ($nextclose === false) {
                // Unbalanced markup: give up and return the input untouched.
                return $content;
            }
            if ($nextopen === false || $nextopen > $nextclose) {
                $open--;
                $walker = $nextclose + 1;
            } elseif ($nextopen < $nextclose) {
                $open++;
                $walker = $nextopen + 1;
            }
        }
        $tagend = $walker;
        if ($tagend > strlen($content)) {
            $tagend = strlen($content);
        } else {
            // Step back inside the delimiters: [$tagstart, $tagend) is the
            // tag body without its outermost '<' and '>'.
            $tagend--;
            $tagstart++;
        }
        // Record the original outer tag and its tag-stripped replacement.
        $tag = substr($content, $tagstart, $tagend - $tagstart);
        $tags[] = '<' . $tag . '>';
        $newtag = format_to_output($tag, $format);
        $newtags[] = '<' . $newtag . '>';
    }
    if (count($tags) === 0) {
        return $content; // nothing to replace
    }
    return str_replace($tags, $newtags, $content);
}
// Sort by string length, longest first
/**
 * usort() comparator: orders strings longest-first, so that the longest
 * candidate phrases are tried against the Wikipedia titles table before
 * their shorter substrings.
 */
function lensort($a, $b)
{
    $lengthA = strlen($a);
    $lengthB = strlen($b);
    return $lengthB - $lengthA;
}
?>