read()) { if (stristr($file,'txt')) { echo "delete file $file
\n"; flush(); unlink('./yacat/'.$file); } } }

// Entry point: crawl the catalogue starting from its root category, then stop.
parse("http://yaca.yandex.ru/yca/cat/");
die('ok');

/**
 * Appends one line to yacat.log and honours the stop-file.
 *
 * The line has the form "<pid>\t<HH:MM:SS>\t<message>\n". After logging, the
 * script terminates with "stop" if a file named 'yacat.stop' exists in the
 * current working directory.
 *
 * @param string $str Message to log.
 * @return void
 */
function tolog($str)
{
    // Only the timestamp is formatted; the message is appended verbatim so
    // that '%' sequences in logged URLs are not interpreted as format codes.
    $line = getmypid()."\t".date('H:i:s')."\t".$str."\n";

    $fp = fopen("yacat.log", "a+");
    if ($fp !== false) {
        fwrite($fp, $line);
        fclose($fp);
    }

    if (file_exists('yacat.stop')) {
        die("stop");
    }
}

/**
 * Loads the cache file that belongs to a category URL.
 *
 * Every line of the file is split on tabs; the first four fields are trimmed
 * and appended to four parallel lists. Missing fields become empty strings.
 *
 * @param string $url Category URL (mapped to a file name by getfilename()).
 * @return array|false Four parallel lists ($arr[0]..$arr[3]), or false when
 *                     the cache file does not exist or cannot be read.
 */
function arrfromurl($url)
{
    tolog("arrfromurl ".$url);

    $filename = getfilename($url);
    if (!file_exists($filename)) {
        tolog("file NOT exist $filename");
        return false;
    }
    tolog("file exist $filename");

    $lines = file($filename);
    if ($lines === false) {
        tolog("file NOT readable $filename");
        return false;
    }

    $arr = array(array(), array(), array(), array());
    foreach ($lines as $str) {
        // explode() replaces split(), which was removed in PHP 7.
        $fields = explode("\t", $str);
        for ($i = 0; $i < 4; $i++) {
            $arr[$i][] = isset($fields[$i]) ? trim($fields[$i], " \r\n\t") : '';
        }
    }

    return $arr;
}

/**
 * Returns the sub-categories of a category, from cache or from the network.
 *
 * If a cache file exists its content is returned as-is (see arrfromurl()).
 * Otherwise the page is downloaded, links of the form
 * href="/<path of $url><name>/" are collected, the result is cached with
 * save_cats() when at least one link matched, and the script sleeps 5 seconds.
 *
 * @param string $url Category URL; the part after the first 22 characters is
 *                    used as the path prefix inside the link pattern.
 * @return array Cached data (four lists), or array(names, titles).
 */
function parse_cat($url)
{
    tolog("parse_cat ".$url);

    $arr = arrfromurl($url);
    if ($arr) {
        return $arr;
    }

    $arr = array(array(), array());

    $buf = file_get_contents($url);
    if ($buf === false) {
        tolog("download FAILED $url");
        $buf = '';
    }

    // The URL path is quoted so that any regex metacharacter in it is matched literally.
    $filtr = '!href\=\"\/'.preg_quote(substr($url, 22), '!').'(\w+)\/\"\>(.*?)\<\/a\>!i';
    if (preg_match_all($filtr, $buf, $match)) {
        foreach ($match[1] as $l => $link) {
            // Strict comparison: numeric-looking names must not collapse into one.
            if (!in_array($link, $arr[0], true)) {
                $arr[0][] = $link;
                $arr[1][] = $match[2][$l];
            }
        }
        save_cats($url, $arr);
    }

    sleep(5); // delay so that the crawler does not get banned

    return $arr;
}

/**
 * Maps a category URL to its cache file name.
 *
 * Takes everything after the first '/cat/' in the URL, replaces '/' with '.'
 * and wraps it as './yacat/cat.<...>txt'; for example
 * http://yaca.yandex.ru/yca/cat/Portals/ becomes ./yacat/cat.Portals.txt.
 *
 * @param string $url Category URL.
 * @return string Relative path of the cache file.
 */
function getfilename($url)
{
    tolog("getfilename ".$url);

    $tail = substr($url, strpos($url, '/cat/') + 5);

    return './yacat/cat.'.str_replace('/', '.', $tail).'txt';
}

/**
 * Appends sub-category rows ("name\ttitle\n") to the category's cache file.
 *
 * @param string $cat_url Category URL that selects the cache file.
 * @param array  $arr     array(names, titles) as parallel lists.
 * @return void
 */
function save_cats($cat_url, $arr)
{
    tolog("save_cats ".$cat_url);

    $file = getfilename($cat_url);
    $fp = fopen($file, "a+");
    if ($fp === false) {
        tolog("cannot open $file");
        return;
    }

    foreach ($arr[0] as $i => $ar) {
        fputs($fp, $arr[0][$i]."\t".$arr[1][$i]."\n");
    }
    fclose($fp);
}

/**
 * Appends site rows (four tab-separated fields per line) to the category's
 * cache file.
 *
 * @param string $cat_url Category URL that selects the cache file.
 * @param array  $arr     Four parallel lists as returned by parse_page().
 * @return void
 */
function save_urls($cat_url, $arr)
{
    tolog("save_urls ".$cat_url);

    $file = getfilename($cat_url);
    $fp = fopen($file, "a+");
    if ($fp === false) {
        tolog("cannot open $file");
        return;
    }

    foreach ($arr[0] as $i => $ar) {
        fputs($fp, $arr[0][$i]."\t".$arr[1][$i]."\t".$arr[2][$i]."\t".$arr[3][$i]."\n");
    }
    fclose($fp);
}

/**
 * Recursively walks the catalogue starting at $url.
 *
 * A category with no sub-categories, or whose cache already holds site rows
 * (first field is an absolute URL), is treated as a leaf and handed to
 * parse_pages(); otherwise each sub-category is visited in turn.
 *
 * @param string $url Category URL ending with '/'.
 * @return void
 */
function parse($url)
{
    tolog("parse ".$url);

    $cat_arr = parse_cat($url);

    // Sub-category names match \w+, so a scheme prefix can only come from
    // cached site rows; https is accepted alongside http.
    $is_leaf = empty($cat_arr[0][0])
        || preg_match('!^https?://!i', $cat_arr[0][0]) === 1;

    if ($is_leaf) {
        parse_pages($url);
        return;
    }

    foreach ($cat_arr[0] as $cat) {
        parse($url.$cat.'/');
    }
}

/**
 * Downloads every listing page of a leaf category and caches the rows.
 *
 * Page 0 is the category URL itself, page N is "<category>N.html". Pages whose
 * first row index (page * 20) is below the number of rows already cached are
 * skipped. The loop ends at the first page that yields no rows.
 *
 * @param string $cat_url Category URL ending with '/'.
 * @return void
 */
function parse_pages($cat_url)
{
    tolog("parse_pages ".$cat_url);

    // arrfromurl() returns false when nothing is cached yet; counting into
    // false would raise a TypeError on PHP 8.
    $cashe_arr = arrfromurl($cat_url);
    $elem_count = is_array($cashe_arr) ? count($cashe_arr[3]) : 0;
    tolog("elem_count in cashe_arr = ".$elem_count);

    for ($page = 0; ; $page++) {
        $page_url = (0 == $page) ? $cat_url : $cat_url.$page.'.html';

        // Assumes 20 rows per listing page.
        if (($page * 20) < $elem_count) {
            tolog("page already parsed $page_url");
            continue;
        }

        $arr = parse_page($page_url);
        if (count($arr[0]) == 0) {
            tolog("break");
            break;
        }

        save_urls($cat_url, $arr);
    }
}

/**
 * Extracts site rows from one listing page.
 *
 * @param string $pageurl Page URL, e.g. http://yaca.yandex.ru/yca/cat/Portals/37.html
 * @return array Four parallel lists: capture groups 1, 2 and 3 of the row
 *               pattern (group 1 is used for de-duplication; presumably URL,
 *               anchor text and description - confirm against the pattern)
 *               and the citation figure taken from the second pattern.
 */
function parse_page($pageurl)
{
    tolog("parse_page ".$pageurl);

    $arr = array(array(), array(), array(), array());

    $buf = file_get_contents($pageurl);
    if ($buf === false) {
        tolog("download FAILED $pageurl");
        $buf = '';
    }

    // NOTE(review): this pattern looks damaged (HTML tags seem to be missing
    // and the trailing "\!" leaves no closing delimiter). It is kept exactly
    // as found; restore it from the original script before relying on it.
    if (preg_match_all("!\(.*?)\<\/a\>\(.*?)\!i", $buf, $match1)) {
        preg_match_all("!Цитируемость\: (\d+)\.!", $buf, $match2);

        foreach ($match1[1] as $i => $url) {
            if (in_array($url, $arr[0], true)) {
                continue;
            }
            $arr[0][] = $url;
            $arr[1][] = $match1[2][$i];
            $arr[2][] = $match1[3][$i];
            // A row without a citation figure gets an empty field.
            $arr[3][] = isset($match2[1][$i]) ? $match2[1][$i] : '';
        }
    }

    sleep(5); // delay so that the crawler does not get banned

    return $arr;
}
?>