db = new ParlDB; $temp1 = $source; # Set up various variables $capsword = "[A-Z][a-zA-Z'0-9]*"; # not starting with number, as catches too much $fillerwords = "of|and|in|on|under|the"; $middlewordre = "(?:$capsword|$fillerwords)\s*"; $endwordre = "(?:$capsword)\s*"; # and, of etc. can't appear at ends # Match either "Two Endwords" or "Endword and Some Middle Words" $greedyproperre = "/\b($endwordre(?:$middlewordre)*$endwordre)\b/ms"; # Match without filler words (so if you have a phrase like # "Amnesty International and Human Rights Watch" you also get both parts # separately "Amnesty International" and "Human Rights Watch") $frugalproperre = "/\b((?:$endwordre){2,})\b/ms"; preg_match_all($greedyproperre, $temp1, $propernounphrases1); preg_match_all($frugalproperre, $temp1, $propernounphrases2); # Three Letter Acronyms preg_match_all("/\b([A-Z]{2,})/ms", $temp1, $acronyms); # We don't want no steenking duplicates $phrases = array_unique(array_merge($propernounphrases1[0], $propernounphrases2[0], $acronyms[0])); # Sort into order, largest first usort($phrases, "lensort"); # Open up a db connection, and whittle our list down even further, against # the real titles. $matched = array(); global $DATABASE; global $HOST; global $USER; global $PASSWORD; if (!($connection = @ mysql_connect ($HOST, $USER, $PASSWORD))) die("could not connect"); if (!(@ mysql_select_db ("wp", $connection))) die("no database found"); foreach ($phrases as $phrase) { $phrase = trim($phrase); $wikistring = ereg_replace(" ","_", $phrase); if (!($match = mysql_query ("SELECT * FROM titles WHERE title = '". mysql_escape_string($wikistring). "'", $connection))) die("something went wrong"); while ($row = @ mysql_fetch_array($match, MYSQL_NUM)){ // $q = $this->db->query("SELECT title FROM titles WHERE title = '" . mysql_escape_string($wikistring). 
"';"); // if ($q->rows > 0) { $found = $row[0]; if ($found == $wikistring) { # See if already matched a string this one is contained within $use = true; foreach ($matched as $got) { if (strstr($got, trim($phrase))) { $use = false; } } # Go ahead only if haven't... if ($use) { # 1 means only replace one match for phrase per paragraph $temp1 = preg_replace ("/{$phrase}/", "{$phrase}", $temp1, 1); array_push($matched, $phrase); } } } } # clean up links with img tags // $temp1 = antiTagInTag ($temp1); return $temp1; }

// Fetch the Technorati "cosmos" XML for a URL (through a 30-minute file
// cache under cache/) and run it through an expat parser.
//
// The parser callbacks start1Element / end1Element / character1Data collect
// the results into the globals $arItems and $itemCount; nothing is returned.
//
// $urlstring  URL to look up. It is interpolated into the API query string
//             exactly as given.
function doCosmos ($urlstring){
    global $curTag;

    // NOTE(review): the API key is hard-coded and $urlstring is not
    // urlencode()d here -- confirm that callers pass an already-encoded value.
    $techRati = "http://api.technorati.com/cosmos?format=xml&url=$urlstring&key=7e64960cc7e9b1cb4315e56a6544fce7";

    // Cache key: the first 7-digit number in the URL. When there is none,
    // fall back to a hash of the URL so unrelated URLs never share one file.
    if (preg_match("/\d{7}/", $urlstring, $nums)) {
        $cacheFilename = $nums[0];
    } else {
        $cacheFilename = md5($urlstring);
    }
    $cachePath = "cache/{$cacheFilename}.xml";

    // TODO (from the original author): improve this logic so it evaluates
    // correctly and doesn't cache too greedily.
    $stale = !file_exists($cachePath)
        || (time() - filemtime($cachePath) > 1800)
        || filesize($cachePath) == 0;

    $xml = false;
    if ($stale) {
        #echo "cache miss!";
        $a = file($techRati);
        // Restart the execution timer once the remote fetch is done.
        set_time_limit(10);
        if ($a !== false) {
            $xml = implode('', $a);
            $cachefp = fopen($cachePath, "w");
            if ($cachefp) {
                fwrite($cachefp, $xml);
                fclose($cachefp);
            }
        }
    }

    if ($xml === false) {
        // Either the cache is still fresh or the fetch failed: use whatever
        // is cached. With nothing cached there is nothing to parse.
        if (!file_exists($cachePath)) {
            return;
        }
        $a = file($cachePath);
        if ($a === false) {
            die("Error reading XML data.");
        }
        $xml = implode('', $a);
    }

    // Start from an empty tag path so that an earlier, aborted parse cannot
    // leave stale path segments behind.
    $curTag = '';

    $xml_parser = xml_parser_create();
    xml_set_element_handler($xml_parser, "start1Element", "end1Element");
    xml_set_character_data_handler($xml_parser, "character1Data");
    // Parse the whole document in one call, flagged as the final chunk.
    xml_parse($xml_parser, $xml, true);
    xml_parser_free($xml_parser);
    #echo "parsing complete";
}

// expat start-element handler: appends "^TAGNAME" to the global tag path
// $curTag and marks the element as not having delivered character data yet.
function start1Element($parser, $tagName, $attrs) {
    global $curTag, $curTagHasData;
    $curTag .= "^$tagName";
    $curTagHasData = false;
}

// expat end-element handler: drops the last "^TAGNAME" segment from the
// global tag path $curTag.
function end1Element($parser, $tagName) {
    global $curTag;
    $caret_pos = strrpos((string) $curTag, '^');
    $curTag = ($caret_pos === false) ? '' : substr($curTag, 0, $caret_pos);
}

// expat character-data handler: copies weblog name, weblog URL and nearest
// permalink into xItem objects stored in the global $arItems.
//
// A weblog name starts a new item at index $itemCount; a permalink completes
// the item and advances $itemCount. expat may deliver the text of a single
// element in several calls (for example around entities such as &amp;), so
// follow-up chunks for the same element are appended rather than treated as
// new values.
function character1Data($parser, $data) {
    global $itemCount, $curTag, $arItems, $curTagHasData;

    $titleKey = "^TAPI^DOCUMENT^ITEM^WEBLOG^NAME";
    $permalinkKey = "^TAPI^DOCUMENT^ITEM^NEARESTPERMALINK";
    $linkKey = "^TAPI^DOCUMENT^ITEM^WEBLOG^URL";

    if (!isset($itemCount)) {
        $itemCount = 0;
    }

    // True when this call continues text already seen for the current
    // element (start1Element clears the flag on every opening tag).
    $continuation = !empty($curTagHasData);
    $curTagHasData = true;

    if ($curTag == $titleKey) {
        if ($continuation && isset($arItems[$itemCount])) {
            $arItems[$itemCount]->xTitle .= $data;
        } else {
            // make new xItem and set its title
            $arItems[$itemCount] = new xItem();
            $arItems[$itemCount]->xTitle = $data;
        }
    } elseif ($curTag == $permalinkKey) {
        if ($continuation && $itemCount > 0 && isset($arItems[$itemCount - 1])) {
            // The counter already advanced on the first chunk, so the item
            // being continued is the previous one.
            $arItems[$itemCount - 1]->xPermalink .= $data;
        } else {
            if (!isset($arItems[$itemCount])) {
                $arItems[$itemCount] = new xItem();
            }
            $arItems[$itemCount]->xPermalink = $data;
            $itemCount++;
        }
    } elseif ($curTag == $linkKey) {
        if (!isset($arItems[$itemCount])) {
            $arItems[$itemCount] = new xItem();
        }
        if ($continuation && isset($arItems[$itemCount]->xLink)) {
            $arItems[$itemCount]->xLink .= $data;
        } else {
            $arItems[$itemCount]->xLink = $data;
        }
    }
}

#credit: isaac schlueter (lifted from http://uk2.php.net/strip-tags)
// Cleans markup that has ended up *inside* another tag, e.g.
// <a title="<b>x</b>">. Every outermost <...> span is located by counting
// nested '<' and '>', its inner text is passed through format_to_output()
// (strip_tags() unless an external implementation exists), and the span is
// replaced by the cleaned version.
//
// $content  HTML to clean.
// $format   Passed through to format_to_output().
// Returns the cleaned HTML, or $content unchanged when a '<' is never closed.
function antiTagInTag( $content = '', $format = 'htmlhead' ) {
    if( !function_exists( 'format_to_output' ) ) {
        // Use the external function if it exists, or fall back on just strip_tags.
        function format_to_output($content, $format) {
            return strip_tags($content);
        }
    }

    $tags = array();
    $newtags = array();
    $length = strlen( $content );
    $tagend = -1;

    for( $tagstart = strpos( $content, '<', $tagend + 1 );
         $tagstart !== false && $tagstart < $length;
         $tagstart = strpos( $content, '<', $tagend ) ) {
        // got the start of a tag. Now find the proper end!
        $walker = $tagstart + 1;
        $open = 1;

        if( $walker >= $length ) {
            // A '<' as the very last character can never be closed. Treat it
            // like any other unclosed tag; otherwise the scan would restart
            // on this same '<' forever.
            return $content;
        }

        while( $open != 0 && $walker < $length ) {
            $nextopen = strpos( $content, '<', $walker );
            $nextclose = strpos( $content, '>', $walker );
            if( $nextclose === false ) {
                // ERROR! Open waka without close waka!
                return $content;
            }
            if( $nextopen === false || $nextopen > $nextclose ) {
                // No more opens, but there was a close; or, a close happens
                // before the next open: step past the close, one level up.
                $open --;
                $walker = $nextclose + 1;
            } else {
                // an open before the next close: one level deeper
                $open ++;
                $walker = $nextopen + 1;
            }
        }

        // $walker sits just past the '>' that ended the scan.
        $tagend = $walker - 1;
        $tag = substr( $content, $tagstart + 1, $tagend - $tagstart - 1 );

        $tags[] = '<' . $tag . '>';
        $newtags[] = '<' . format_to_output( $tag, $format ) . '>';
    }

    return str_replace( $tags, $newtags, $content );
}
?>