read()) { if($file != '.' && $file != '..') { $arr_keys[$keys_num++] = $cfg_keys_path.$file; } } $my_keys_dir -> close(); // 图片信息 $pic_num = 0; $arr_pic = array(); $my_pic_dir = dir(dirname(__FILE__).'/'.$cfg_pic_path); while($file = $my_pic_dir -> read()) { if($file != '.' && $file != '..') { $arr_pic[$pic_num++] = $cfg_pic_path.$file; } } $my_pic_dir -> close(); // 读取子模板文件夹 $num_sub_temp = 0; $arr_sub_temp = array(); $sub_temp_dir = dir(dirname(__FILE__).'/'.$temp_path); while($file = $sub_temp_dir -> read()) { if($file != '.' && $file != '..') { $arr_sub_temp[$num_sub_temp++] = $temp_path.$file; } } $sub_temp_dir -> close(); // 语料库子目录选取 $num_yuliao = 0; $arr_yuliao = array(); $yuliao_dir = dir(dirname(__FILE__).'/'.$cfg_yuliao_path); while($file = $yuliao_dir -> read()) { if($file != '.' && $file != '..') { $arr_yuliao[$num_yuliao++] = $file; } } $yuliao_dir -> close(); // 随机读取其中一个语料库的内容到其中 $num_sub_yuliao = 0; $arr_sub_yuliao = array(); $this_sub_dir = $arr_yuliao[rand(0, count($arr_yuliao) - 1)]; $sub_yuliao_dir = dir(dirname(__FILE__).'/'.$cfg_yuliao_path.$this_sub_dir); while($file = $sub_yuliao_dir -> read()) { if($file != '.' && $file != '..') { $arr_sub_yuliao[$num_sub_yuliao++] = $cfg_yuliao_path.$this_sub_dir.'/'.$file; } } $sub_yuliao_dir -> close(); // 由于文章库需要经常更新 就不取子目录 仅以最新文章填充 $num_wenzhang = 0; $arr_wenzhang = array(); $dir_wenzhang = dir(dirname(__FILE__).'/'.$cfg_wenzhang_path); while($file = $dir_wenzhang -> read()) { if($file != '.' && $file != '..') { $arr_wenzhang[$num_wenzhang++] = $cfg_wenzhang_path.$file; } } $dir_wenzhang -> close(); // 标题库文件的选取 $num_title = 0; $arr_title = array(); $title_dir = dir(dirname(__FILE__).'/'.$cfg_rand_title); while($file = $title_dir -> read()) { if($file != '.' 
&& $file != '..') { $arr_title[$num_title++] = $cfg_rand_title.'/'.$file; } } $title_dir -> close(); // 选取最新的标题,此标题每天更换 $arr_file_title = file($arr_title[rand(0, count($arr_title) - 1)]); // 外链文件夹的选取 $num_wl = 0; $arr_wl = array(); $wl_dir = dir(dirname(__FILE__).'/'.$cfg_rand_wl); while($file = $wl_dir -> read()) { if($file != '.' && $file != '..') { $arr_wl[$num_wl++] = $cfg_rand_wl.'/'.$file; } } $wl_dir -> close(); $arr_file_wl = file($arr_wl[rand(0, count($arr_wl) - 1)]); // 随机标题库 和 正文内容都通过一个API函数抓取 //$arrData = getData(); // 随机标题库 /* $txt_sjbiaoti = $arrData['title']; // 转码-标题 $encode = mb_detect_encoding($txt_sjbiaoti, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5")); if('utf-8' != $encode && 'UTF-8' != $encode) { $txt_sjbiaoti = mb_convert_encoding($txt_sjbiaoti, 'utf-8', $encode); } */ //$arr_sjbiaoti = explode(PHP_EOL, $txt_sjbiaoti); $arr_sjbiaoti = $arr_file_title; // 本次句子库换一种调用方式 while(!$t_filename = $arr_sub_yuliao[rand(0, count($arr_sub_yuliao)) - 1]) { $t_filename = $arr_sub_yuliao[rand(0, count($arr_sub_yuliao)) - 1]; } $arr_juzi = file($t_filename); // 本次的选取的句子库 /* $txt_juzi = $arrData['content']; // 转码-内容 $encode = mb_detect_encoding($txt_juzi, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5")); if('utf-8' != $encode && 'UTF-8' != $encode) { $txt_juzi = mb_convert_encoding($txt_juzi, 'utf-8', $encode); } $arr_juzi = explode(PHP_EOL, $txt_juzi); */ // 域名信息 $txt_domain = file_get_contents($cfg_domain_path); $arr_domain = explode(PHP_EOL, $txt_domain); // 随机作者 $txt_author = file_get_contents($cfg_author_path); $arr_author = explode(PHP_EOL, $txt_author); // 随机后缀 $txt_houzhui = file_get_contents($cfg_houzhui_path); $arr_houzhui = explode(PHP_EOL, $txt_houzhui); // 产生随机数字 $arr_rand_num = array('1', '2', '3', '4', '5', '6', '7', '8', '9'); // 产生随机字母 $arr_rand_word = array('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'); // 产生随机字符 $srand_str = 
'123456789abcdefghijklmnopqrstuvwxyz'; // 本次读取的是哪个关键词文本文件 $txt_one_keys = file_get_contents($arr_keys[rand(0, count($arr_keys) - 1)]); $arr_one_keys = explode(PHP_EOL, $txt_one_keys); // print_r($arr_one_keys); $this_title = $arr_one_keys[rand(0, count($arr_keys) - 1)]; // 开启模板引擎,如果是首页则调用首页模板,如果是内页则调用内页模板 if(!$this_url) { $index_temp = file_get_contents($arr_sub_temp[rand(0, count($arr_sub_temp) - 1)].'/'.'index.html'); //echo "开始调用首页模板文件!
"; } else { // 内容页模板 $index_temp = file_get_contents($arr_sub_temp[rand(0, count($arr_sub_temp) - 1)].'/'.'neiye.html'); //echo "开始调用内页模板文件!
"; } // 开始多模板做数据处理,支持指定格式的模板文件 // 使用preg_match_all匹配标题 if(preg_match_all("//si", $index_temp, $match_title)) { $arr_pure_title = array_unique($match_title[0]); $num_result_title = 0; $arr_result_title = array(); for($i = 0; $i < count($match_title[0]); $i++) { if($arr_pure_title[$i]) { $arr_result_title[$num_result_title++] = $arr_pure_title[$i]; } } for($i = 0; $i < count($arr_result_title); $i++) { $patterns = '/'.$arr_result_title[$i].'/si'; $index_temp = preg_replace($patterns, trim($arr_file_title[rand(0, count($arr_file_title) - 1)]), $index_temp); } } // 使用preg_match_all匹配标题 if(preg_match_all("//si", $index_temp, $match_title)) { $arr_pure_title = array_unique($match_title[0]); $num_result_title = 0; $arr_result_title = array(); for($i = 0; $i < count($match_title[0]); $i++) { if($arr_pure_title[$i]) { $arr_result_title[$num_result_title++] = $arr_pure_title[$i]; } } for($i = 0; $i < count($arr_result_title); $i++) { $patterns = '/'.$arr_result_title[$i].'/si'; $specKwd = trim($arr_one_keys[rand(0, count($arr_one_keys) - 1)]); // 先转成ASCII编码,再转码城unicode编码 /* $encode = mb_detect_encoding($specKwd, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5")); if('GB2312' != $encode && 'GBK' != $encode) { $specKwd = mb_convert_encoding($specKwd, 'GB2312', $encode); } $specKwd = unicode_encode($specKwd); */ $index_temp = preg_replace($patterns, trim($specKwd), $index_temp); } } // 匹配当前网页标题 if(strstr($index_temp, '')) { $index_temp = preg_replace('//si', trim($this_title), $index_temp); } // 解析当前日期标签 if(strstr($index_temp, '')) { $index_temp = preg_replace('//si', trim($this_date), $index_temp); } // 解析随机日期标签 while(preg_match("//si", $index_temp, $match_sjriqi)) { $t_s = rand(31536000, 51536000); $time_t = date("Y-m-d", $time - $t_s); $index_temp = preg_replace('//si', trim($time_t), $index_temp, 1); } // 解析随机后缀 if(strstr($index_temp, '')) { $index_temp = preg_replace('//si', trim($arr_houzhui[rand(0, count($arr_houzhui) - 1)]), $index_temp); } // 解析随机作者 
if (strstr($index_temp, '')) {
    $index_temp = preg_replace('//si', trim($arr_author[rand(0, count($arr_author) - 1)]), $index_temp);
}

// NOTE(review): every tag literal in this section is empty in this copy of the
// file ('' for strstr(), //si for preg_*()). The comments suggest each one
// originally named a distinct template tag - presumably lost in extraction;
// restore them from the real templates before running this code.

// Current-time tag
if (strstr($index_temp, '')) {
    $index_temp = preg_replace('//si', trim($this_time), $index_temp);
}

// Current-domain tag
if (strstr($index_temp, '')) {
    $index_temp = preg_replace('//si', trim($this_host), $index_temp);
}

// Fixed-format picture tags: each distinct tag is replaced, in all of its
// occurrences, by one randomly chosen picture.
if (preg_match_all("//si", $index_temp, $match_pic)) {
    // array_unique() preserves the original keys, so compact the list instead of
    // indexing it 0..n-1; array_filter() drops the same falsy entries the former
    // `if($arr[$i])` test skipped.
    $arr_result_pic = array_values(array_filter(array_unique($match_pic[0])));
    foreach ($arr_result_pic as $tag_pic) {
        // The matched text is used as a literal, so escape it before building a pattern.
        $patterns = '/'.preg_quote($tag_pic, '/').'/si';
        $index_temp = preg_replace($patterns, trim($arr_pic[rand(0, count($arr_pic) - 1)]), $index_temp);
    }
}

// Fixed-format sentence tags: same scheme, filled from the sentence library.
if (preg_match_all("//si", $index_temp, $match_juzi)) {
    $arr_result_juzi = array_values(array_filter(array_unique($match_juzi[0])));
    foreach ($arr_result_juzi as $tag_juzi) {
        $patterns = '/'.preg_quote($tag_juzi, '/').'/si';
        $index_temp = preg_replace($patterns, trim($arr_juzi[rand(0, count($arr_juzi) - 1)]), $index_temp);
    }
}

// The loops below replace one occurrence per iteration so that every occurrence
// of a tag receives its own random value.

// Filled from $arr_one_keys (the original comment labelled this "random number").
while (preg_match("//si", $index_temp, $match_sjbiaoti)) {
    $index_temp = preg_replace('//si', trim($arr_one_keys[rand(0, count($arr_one_keys) - 1)]), $index_temp, 1);
}

// Random character. rand() includes its upper bound, so the last valid offset
// is strlen() - 1; the former strlen() bound could index past the end.
while (preg_match("//si", $index_temp, $match_sjzifu)) {
    $index_temp = preg_replace('//si', trim($srand_str[rand(0, strlen($srand_str) - 1)]), $index_temp, 1);
}

// Random external link
while (preg_match("//si", $index_temp, $match_sjwl)) {
    $index_temp = preg_replace('//si', trim($arr_file_wl[rand(0, count($arr_file_wl) - 1)]), $index_temp, 1);
}

// Random keyword
while (preg_match("//si", $index_temp, $match_sjkwd)) {
    $thekwd = trim($arr_one_keys[rand(0, count($arr_one_keys) - 1)]);
    $index_temp = preg_replace('//si', $thekwd, $index_temp, 1);
}

// Random title, unlimited length
while (preg_match("//si", $index_temp, $match_sjbiaoti)) {
    $index_temp = preg_replace('//si', trim($arr_sjbiaoti[rand(0, count($arr_sjbiaoti) - 1)]), $index_temp, 1);
}

// Random title, limited length: one pass per distinct length-limited tag.
if (preg_match_all("//si", $index_temp, $match_sjbiaotiid)) {
    $arr_result_sjbiaotiid = array_values(array_filter(array_unique($match_sjbiaotiid[0])));
    foreach ($arr_result_sjbiaotiid as $tag_sjbiaotiid) {
        $patterns = '/'.preg_quote($tag_sjbiaotiid, '/').'/si';

        // NOTE(review): $id is read here without any visible assignment, so $len
        // evaluates to 0 as written. Presumably the length was meant to be parsed
        // out of the matched tag - confirm against the original file.
        $id = str_replace("//si", "", $id);
        $len = (int)$id; // number of characters to keep from the random title

        while (preg_match($patterns, $index_temp)) {
            $sj_biaotiid = mb_substr(trim($arr_sjbiaoti[rand(0, count($arr_sjbiaoti) - 1)]), 0, $len, 'utf-8');
            $index_temp = preg_replace($patterns, $sj_biaotiid, $index_temp, 1);
        }
    }
}

// Random sentence
while (preg_match("//si", $index_temp, $match_sjjuzi)) {
    $index_temp = preg_replace('//si', trim($arr_juzi[rand(0, count($arr_juzi) - 1)]), $index_temp, 1);
}

// Random digit
while (preg_match("//si", $index_temp, $match_sjshuzi)) {
    $index_temp = preg_replace('//si', trim($arr_rand_num[rand(0, count($arr_rand_num) - 1)]), $index_temp, 1);
}

// Random letter
while (preg_match("//si", $index_temp, $match_sjzimu)) {
    $index_temp = preg_replace('//si', trim($arr_rand_word[rand(0, count($arr_rand_word) - 1)]), $index_temp, 1);
}

// Random picture
while (preg_match("//si", $index_temp, $match_sjtupian)) {
    $index_temp = preg_replace('//si', trim($arr_pic[rand(0, count($arr_pic) - 1)]), $index_temp, 1);
}

// Random domain
while (preg_match("//si", $index_temp, $match_sjyuming)) {
    $index_temp = preg_replace('//si', trim($arr_domain[rand(0, count($arr_domain) - 1)]), $index_temp, 1);
}

// When content is read straight from the article library, two extra tags are supported: xs_正文标题 + xs_正文内容
// One article is picked at random; its title and body fill the two article tags.
$arrData = getData();

// NOTE(review): the tag literals in the strstr()/preg_replace() calls below are
// empty in this copy of the file ('' and //si). Presumably they named the
// article-title and article-body template tags - restore them before running.
if (strstr($index_temp, '')) {
    $zw_title = $arrData['title'];
    $index_temp = preg_replace('//si', trim($zw_title), $index_temp);
}

// Article body
if (strstr($index_temp, '')) {
    $finalArc = $arrData['content'];
    $encode = mb_detect_encoding($finalArc, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5"));
    // mb_detect_encoding() returns false when nothing matches; converting from
    // "false" is an error, so only convert from a real, non-UTF-8 encoding.
    if (false !== $encode && 'utf-8' != $encode && 'UTF-8' != $encode) {
        $finalArc = mb_convert_encoding($finalArc, 'utf-8', $encode);
    }
    $index_temp = preg_replace('//si', $finalArc, $index_temp);
}

// Write the rendered page into the cache tree: <html_path>/<host>/<url segments...>
// NOTE(review): $this_host and $arr_urls are defined outside this view; if they
// derive from the request they should be validated against "../" traversal.
$tmp_path = $this_host.'/';

// Per-domain cache directory
if (!is_dir($html_path.$tmp_path)) {
    mkdir($html_path.$tmp_path);
}

// One directory level per URL segment, excluding the last segment
for ($i = 0; $i < count($arr_urls) - 1; $i++) {
    $tmp_path .= $arr_urls[$i].'/';
    if (!is_dir($html_path.$tmp_path)) {
        mkdir($html_path.$tmp_path);
    }
}

// A non-empty last segment is the file name; an empty one means the URL names a
// directory, which is stored as index.html.
$last_segment = $arr_urls[count($arr_urls) - 1];
file_put_contents($html_path.$tmp_path.($last_segment ? $last_segment : 'index.html'), $index_temp);

/**
 * Pick one random article from a random sub-directory of the article library.
 *
 * The library root is dirname(__FILE__).'/'.$cfg_wenzhang_path (global). The
 * chosen file is converted to UTF-8 when another encoding is detected and is
 * split on the "#标题分割#" separator into title and content.
 *
 * @return array array('title' => string, 'content' => string); both values are
 *               '' when no article can be read or the separator is missing.
 */
function getData()
{
    global $cfg_wenzhang_path;

    // Always return both keys so callers never see an undefined index.
    $result = array('title' => '', 'content' => '');

    // Sub-libraries of the article library
    $base_dir = dirname(__FILE__).'/'.$cfg_wenzhang_path;
    if (!is_dir($base_dir)) {
        return $result;
    }
    $entries = scandir($base_dir);
    if (false === $entries) {
        return $result;
    }
    // scandir() avoids the `while ($file = $dir->read())` idiom, which stops
    // early on an entry named "0".
    $arr_wenzhang = array_values(array_diff($entries, array('.', '..')));
    if (count($arr_wenzhang) === 0) {
        return $result;
    }

    // Articles of one randomly chosen sub-library
    $this_sub_dir = $arr_wenzhang[rand(0, count($arr_wenzhang) - 1)];
    $sub_dir = $base_dir.$this_sub_dir;
    if (!is_dir($sub_dir)) {
        return $result;
    }
    $entries = scandir($sub_dir);
    if (false === $entries) {
        return $result;
    }
    $arr_sub_wenzhang = array_values(array_diff($entries, array('.', '..')));
    if (count($arr_sub_wenzhang) === 0) {
        return $result;
    }

    // Read the file from the directory that was just listed, so the lookup does
    // not depend on the current working directory.
    $file1 = file_get_contents($sub_dir.'/'.$arr_sub_wenzhang[rand(0, count($arr_sub_wenzhang) - 1)]);
    if (false === $file1) {
        return $result;
    }

    $encode = mb_detect_encoding($file1, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5"));
    if (false !== $encode && 'utf-8' != $encode && 'UTF-8' != $encode) {
        $file1 = mb_convert_encoding($file1, 'utf-8', $encode);
    }

    // Layout: <title>#标题分割#<content>
    $arr_theFile = explode("#标题分割#", $file1);
    if (count($arr_theFile) > 1) {
        $result['title'] = $arr_theFile[0];
        $result['content'] = $arr_theFile[1];
    }

    return $result;
}

/**
 * Tell whether the requesting user agent contains one of the listed
 * search-engine spider names.
 *
 * @return bool true on a match; false otherwise or when no user agent was sent.
 */
function isGoodCrawler()
{
    if (!isset($_SERVER['HTTP_USER_AGENT'])) {
        return false;
    }
    $agent = strtolower($_SERVER['HTTP_USER_AGENT']);

    $spiderSite = array(
        "baiduspider",
        "baidu",
        "yisouspider",
        "360spider",
        "haosouspider",
        "sogou",
        "sosospider"
    );

    foreach ($spiderSite as $spider) {
        if (strpos($agent, $spider) !== false) {
            return true;
        }
    }

    return false;
}

/**
 * Tell whether the requesting user agent contains any entry of the crawler
 * list below (compared case-insensitively).
 *
 * @return bool true on a match; false otherwise, including when the user agent
 *              is missing or empty.
 */
function isCrawler()
{
    $agent = isset($_SERVER['HTTP_USER_AGENT']) ? strtolower($_SERVER['HTTP_USER_AGENT']) : '';
    if (empty($agent)) {
        return false;
    }

    $spiderSite = array(
        "TencentTraveler",
        "Baiduspider+",
        "Yisouspider",
        "360Spider",
        "BaiduGame",
        "Googlebot",
        "msnbot",
        "Sosospider+",
        "Sogou web spider",
        "ia_archiver",
        "Yahoo! Slurp",
        "YoudaoBot",
        "Yahoo Slurp",
        "MSNBot",
        "Java (Often spam bot)",
        "BaiDuSpider",
        "Voila",
        "Yandex bot",
        "BSpider",
        "twiceler",
        "Sogou Spider",
        "Speedy Spider",
        "Google AdSense",
        "Heritrix",
        "Python-urllib",
        "Alexa (IA Archiver)",
        "Ask",
        "Exabot",
        "Custo",
        "OutfoxBot/YodaoBot",
        "yacy",
        "SurveyBot",
        "legs",
        "lwp-trivial",
        "Nutch",
        "StackRambler",
        "The web archive (IA Archiver)",
        "Perl tool",
        "MJ12bot",
        "Netcraft",
        "MSIECrawler",
        "WGet tools",
        "larbin",
        "Fish search",
        "MauiBot",
        "MegaIndex",
        "DotBot",
        "AlphaBot",
        "semrush"
    );

    foreach ($spiderSite as $val) {
        if (strpos($agent, strtolower($val)) !== false) {
            return true;
        }
    }

    // Previously the function fell off the end here and returned NULL.
    return false;
}

/**
 * Encode a string as a sequence of decimal numeric character references.
 *
 * @param string $str      source string
 * @param string $encoding encoding of $str, default GBK
 * @param string $prefix   text placed before each code unit, default "&#"
 * @param string $postfix  text placed after each code unit, default ";"
 *
 * @return string one <prefix><decimal><postfix> group per UCS-2 code unit;
 *                '' when $str is empty or cannot be converted.
 */
function unicode_encode($str, $encoding = 'GBK', $prefix = '&#', $postfix = ';')
{
    // Request big-endian explicitly: plain "UCS-2" uses a platform-dependent
    // byte order, while the hex decoding below assumes high byte first.
    $str = iconv($encoding, 'UCS-2BE', $str);
    if (false === $str || '' === $str) {
        return '';
    }

    $unistr = '';
    foreach (str_split($str, 2) as $unit) {
        $unistr .= $prefix.hexdec(bin2hex($unit)).$postfix;
    }

    return $unistr;
}

echo $index_temp;
?>