create table titles (title varchar(190) NOT NULL, PRIMARY KEY (title)) type=MyISAM; followed by mysql> load data infile '/home/stefan/whitelabel.org/wp/all_titles_in_ns0' ignore into table titles; The bits of code I didn't borrow from elsewhere (and I've credited where) is licenced under the GPL. Do with it what you will, but this is my first php and my first code for 7 years, so I'd appreciate feedback and suggestions via comments on my blog: http://www.whitelabel.org/archives/002248.html */ set_include_path('/data/vhost/wikiproxy.whitelabel.org/docs/pear/include'); require_once "PEAR.php"; require_once "HTTP/Request.php"; $DATABASE="wp"; $HOST="pear"; $USER="stefan"; $PASSWORD="XXXX"; class xItem { var $xTitle; var $xLink; var $xPermalink; } # array of technorati links $arItems = array(); $itemCount = 0; # grab the page if (isset($_GET['url']) and substr($_GET['url'],13)!= "http://news.bbc.co.uk"){ $url_string = $_GET['url']; }else { $url_string= 'http://news.bbc.co.uk'; } $req =& new HTTP_Request ( $url_string ); if (!PEAR::isError($req->sendRequest())) { $result = $req->getResponseBody(); #now fixup the stylesheet ref, and wikiproxify the urls $result = str_replace ('', 'This is not News Online. for details see this page. Supported by backstage.bbc.co.uk' , $result); #$result = str_replace ('', 'This is not News Online. for details see this page. For discussion only' , $result); # only process pages that are actually articles (i.e. have a 7 digit string in them) if (preg_match ("/\d{7}/", $_GET['url'])) { $temp = $result; $temp1 =preg_replace("/(.*?(.*).*)/ms","$2",$temp); $temp1=wikipedize($temp1); # plug it back in $result =preg_replace("/(.*)/ms", $temp1 ,$result); #reduce page size by 20% by dumping whitespace. # $result = preg_replace("/\s+/ms"," ",$result); # right. now get the technorati links. doCosmos($_GET["url"]); # now we need to stick them in the right hand nav if (!count($arItems)<2){ # first let's build a block $techblock = '
BLOGS ABOUT THIS ARTICLE

'; $techblocktail="(links from technorati.com)
"; for ($i=0;$ixLink == $txItemTemp->xLink) $test = 0; } if ($test == 1) { #if there's a permalink, use that. if (strlen($txItem->xPermalink)>5){ # echo "item:" . $txItem->xPermalink . " " . $txItem->xLink . " " . $txItem.xLink . "
"; $techblock .= "
xPermalink\">$txItem->xTitle
"; } #else just a link else $techblock .= "
xLink\">$txItem->xTitle
"; } } $techblock .=$techblocktail; #now on BBC news there doesn't seem to be a clear hook with which to anchor into the RHN, so we'll have to use three. $result =preg_replace( "/(.*)(
|
|
db = new ParlDB; $temp1 = $source; # Set up various variables $capsword = "[A-Z][a-zA-Z'0-9]*"; # not starting with number, as catches too much $fillerwords = "of|and|in|on|under|the"; $middlewordre = "(?:$capsword|$fillerwords)\s*"; $endwordre = "(?:$capsword)\s*"; # and, of etc. can't appear at ends # Match either "Two Endwords" or "Endword and Some Middle Words" $greedyproperre = "/\b($endwordre(?:$middlewordre)*$endwordre)\b/ms"; # Match without filler words (so if you have a phrase like # "Amnesty International and Human Rights Watch" you also get both parts # separately "Amnesty International" and "Human Rights Watch") $frugalproperre = "/\b((?:$endwordre){2,})\b/ms"; preg_match_all($greedyproperre, $temp1, $propernounphrases1); preg_match_all($frugalproperre, $temp1, $propernounphrases2); # Three Letter Acronyms preg_match_all("/\b([A-Z]{2,})/ms", $temp1, $acronyms); # We don't want no steenking duplicates $phrases = array_unique(array_merge($propernounphrases1[0], $propernounphrases2[0], $acronyms[0])); # Sort into order, largest first usort($phrases, "lensort"); # Open up a db connection, and whittle our list down even further, against # the real titles. $matched = array(); global $DATABASE; global $HOST; global $USER; global $PASSWORD; if (!($connection = @ mysql_connect ($HOST, $USER, $PASSWORD))) die("could not connect"); if (!(@ mysql_select_db ("wp", $connection))) die("no database found"); foreach ($phrases as $phrase) { $phrase = trim($phrase); $wikistring = ereg_replace(" ","_", $phrase); if (!($match = mysql_query ("SELECT * FROM titles WHERE title = '". mysql_escape_string($wikistring). "'", $connection))) die("something went wrong"); while ($row = @ mysql_fetch_array($match, MYSQL_NUM)){ // $q = $this->db->query("SELECT title FROM titles WHERE title = '" . mysql_escape_string($wikistring). 
"';"); // if ($q->rows > 0) { $found = $row[0]; if ($found == $wikistring) { # See if already matched a string this one is contained within $use = true; foreach ($matched as $got) { if (strstr($got, trim($phrase))) { $use = false; } } # Go ahead only if haven't... if ($use) { # 1 means only replace one match for phrase per paragraph $temp1 = preg_replace ("/{$phrase}/", "{$phrase}", $temp1, 1); array_push($matched, $phrase); } } } } # clean up links with img tags // $temp1 = antiTagInTag ($temp1); return $temp1; }

# Fetch (or reuse a cached copy of) the Technorati "cosmos" XML for $urlstring
# and parse it with the start1Element / end1Element / character1Data handlers,
# which fill the globals $arItems and $itemCount.
#
# $urlstring - address of the article; it must contain a 7 digit story id,
#              which is used as the cache file name (cache/<id>.xml).
#
# Returns nothing. If the url has no story id, or no cached XML is available
# after the fetch attempt, the function returns without touching $arItems.
function doCosmos ($urlstring){
    # The 7 digit story id doubles as the cache key; without one we would end
    # up reading and writing "cache/.xml".
    if (!preg_match ("/\d{7}/", $urlstring, $nums)) {
        return;
    }
    $cacheFilename = $nums[0];
    $cachePath = "cache/{$cacheFilename}.xml";

    # The article url is a query-string value, so it has to be encoded;
    # otherwise any '&' or '?' inside it corrupts the API request.
    # NOTE(review): the API key is hard-coded here - consider moving it to config.
    $techRati = "http://api.technorati.com/cosmos?format=xml&url=" . urlencode($urlstring) . "&key=7e64960cc7e9b1cb4315e56a6544fce7";

    # Refresh when the cache entry is missing, older than 30 minutes, or empty.
    $needsRefresh = !file_exists($cachePath)
        || (time() - filemtime($cachePath) > 1800)
        || filesize($cachePath) == 0;

    if ($needsRefresh) {
        $contents = @file_get_contents($techRati);
        set_time_limit (10);
        # Only overwrite the cache when the fetch actually produced something,
        # so a failed request no longer truncates a previously good copy.
        if ($contents !== false && $contents !== '') {
            $cachefp = @fopen($cachePath, "w");
            if ($cachefp) {
                fwrite($cachefp, $contents);
                fclose($cachefp);
            }
        }
    }

    # Nothing cached and nothing fetched: there are simply no links to show.
    if (!file_exists($cachePath)) {
        return;
    }

    # Start every parse with an empty element path, in case an earlier parse
    # was abandoned half way through a document.
    global $curTag;
    $curTag = '';

    $xml_parser = xml_parser_create();
    xml_set_element_handler($xml_parser, "start1Element", "end1Element");
    xml_set_character_data_handler($xml_parser, "character1Data");

    $fp = fopen($cachePath, "r") or die("Error reading XML data.");

    # Feed the file to the parser in 16KB chunks. The chunk is compared
    # explicitly so that a chunk consisting of "0" does not end the loop early.
    while (!feof($fp)) {
        $data = fread($fp, 16384);
        if ($data === false || $data === '') {
            break;
        }
        if (!xml_parse($xml_parser, $data, feof($fp))) {
            # Malformed XML: keep whatever items were collected so far.
            break;
        }
    }

    fclose($fp);
    xml_parser_free($xml_parser);
}

# XML start-tag handler: appends "^TAGNAME" to the global element path $curTag.
function start1Element($parser, $tagName, $attrs) {
    global $curTag;
    $curTag .= "^$tagName";
}

# XML end-tag handler: removes the last "^TAGNAME" from the global element
# path. When the element being closed is an ITEM for which character1Data
# created an entry, the item index is advanced so the next ITEM gets a new slot.
function end1Element($parser, $tagName) {
    global $curTag, $itemCount, $arItems;

    if ($curTag == "^TAPI^DOCUMENT^ITEM" && isset($arItems[$itemCount])) {
        $itemCount++;
    }

    $caret_pos = strrpos($curTag,'^');
    $curTag = ($caret_pos === false) ? '' : substr($curTag,0,$caret_pos);
}

# XML character-data handler: copies weblog name, weblog url and nearest
# permalink of the current ITEM into $arItems[$itemCount].
#
# The parser may deliver one text node in several pieces (for instance around
# an "&amp;" inside a url), so text is appended rather than assigned, and the
# item index is advanced by end1Element rather than here.
function character1Data($parser, $data) {
    global $itemCount, $curTag, $arItems;

    $titleKey = "^TAPI^DOCUMENT^ITEM^WEBLOG^NAME";
    $permalinkKey = "^TAPI^DOCUMENT^ITEM^NEARESTPERMALINK";
    $linkKey = "^TAPI^DOCUMENT^ITEM^WEBLOG^URL";

    if ($curTag != $titleKey && $curTag != $permalinkKey && $curTag != $linkKey) {
        return;
    }

    # First interesting text of this ITEM: make the xItem that will hold it.
    if (!isset($arItems[$itemCount])) {
        $arItems[$itemCount] = new xItem();
        $arItems[$itemCount]->xTitle = '';
        $arItems[$itemCount]->xLink = '';
        $arItems[$itemCount]->xPermalink = '';
    }

    if ($curTag == $titleKey) {
        $arItems[$itemCount]->xTitle .= $data;
    } elseif ($curTag == $permalinkKey) {
        $arItems[$itemCount]->xPermalink .= $data;
    } else {
        $arItems[$itemCount]->xLink .= $data;
    }
}

#credit: isaac schlueter (lifted from http://uk2.php.net/strip-tags)
#
# Cleans up tags that contain other tags (e.g. markup injected inside an
# attribute value). Every outermost <...> span is located by counting nested
# '<' and '>' characters, its interior is passed through format_to_output()
# (which falls back to strip_tags() when no such function exists), and the
# cleaned tag replaces the original in $content.
#
# $content - markup to clean.
# $format  - passed through to format_to_output().
#
# Returns the cleaned markup; if a '<' without any following '>' is met,
# $content is returned unchanged.
function antiTagInTag( $content = '', $format = 'htmlhead' )
{
    if( !function_exists( 'format_to_output' ) )
    {
        // Use the external function if it exists, or fall back on just strip_tags.
        function format_to_output($content, $format)
        {
            return strip_tags($content);
        }
    }

    // Initialised up front so that content without any tag still reaches
    // str_replace() with valid (empty) arrays.
    $tags = array();
    $newtags = array();
    $length = strlen( $content );
    $tagend = -1;

    for( $tagstart = strpos( $content, '<', $tagend + 1 );
         $tagstart !== false && $tagstart < $length;
         $tagstart = strpos( $content, '<', $tagend ) )
    {
        // got the start of a tag. Now find the proper end!
        $walker = $tagstart + 1;
        $open = 1;
        while( $open != 0 && $walker < $length )
        {
            $nextopen = strpos( $content, '<', $walker );
            $nextclose = strpos( $content, '>', $walker );
            if( $nextclose === false )
            {
                // Open bracket without a closing one: malformed, give up.
                return $content;
            }
            if( $nextopen === false || $nextopen > $nextclose )
            {
                // No more opens, but there was a close; or, a close happens
                // before the next open. Walker goes to close+1, open decrements.
                $open --;
                $walker = $nextclose + 1;
            }
            elseif( $nextopen < $nextclose )
            {
                // an open before the next close
                $open ++;
                $walker = $nextopen + 1;
            }
        }

        $tagend = $walker;
        if( $tagend > $length )
        {
            $tagend = $length;
        }
        else
        {
            // Step back onto the closing '>' and forward past the opening '<'
            // so that $tag holds only the interior of the tag.
            $tagend --;
            $tagstart ++;
        }

        $tag = substr( $content, $tagstart, $tagend - $tagstart );
        $tags[] = '<' . $tag . '>';
        $newtags[] = '<' . format_to_output( $tag, $format ) . '>';
    }

    return str_replace($tags, $newtags, $content);
}
?>