本人在国内最早的博客网站bokee.com上有过一段时间的日志记录,现在看来虽然没有什么特别的用处,但也是一些回忆和纪念,因此决定批量导入到本站。
采用的是Yii2框架结合“PHP Simple HTML DOM Parser”,其地址为:http://sourceforge.net/projects/simplehtmldom/ 。我用的是它的yii2插件版本keltstr\simplehtmldom,但是由于年久失修,它已经不兼容php7+语法,因此我自己做了一个升级版本,放在我的github上,地址:https://github.com/deaboway/yii2-simplehtmldom
wp升级之后,代码高亮的插件不兼容,现在已经弃用了。虽然该插件带来很大方便,但是也让加载速度变慢;直接贴的代码又不是没法看,而且新版本的wp对代码格式的支持也比较好了。现在就把爬bokee.com上自己小站的代码奉上。我在bokee上的二级域名是:http://deaboway.bokee.com/
<?php
namespace console\controllers;
use Yii;
use yii\console\Controller;
use keltstr\simplehtmldom\SimpleHTMLDom;
use backend\models\BokeeArticle;
use Exception;
/**
 * Console command that crawls the author's blog on bokee.com and stores every
 * article it finds as a BokeeArticle record.
 *
 * Pages are fetched and parsed with Simple HTML DOM
 * (manual: https://simplehtmldom.sourceforge.io/manual.htm).
 */
class BokeeController extends Controller
{
    /** User-Agent sent with every HTTP request made through the PHP stream wrapper. */
    const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36';

    /** Base URL of the blog; list pages are addressed as BASE_URL . <page number>. */
    const BASE_URL = 'http://deaboway.bokee.com/';

    /** Only links containing this fragment are treated as article detail pages. */
    const ARTICLE_URL_FRAGMENT = 'http://deaboway.bokee.com/1';

    /**
     * Imports the blog: prints a timestamp, then walks list pages 1 through 14.
     */
    public function actionDo()
    {
        // Timestamp
        echo "date:" . date("Y-m-d H:i:s") . "\n\n";

        for ($id = 1; $id < 15; $id++) {
            $this->paserList($id);
        }
    }

    /**
     * Default action: only prints the current timestamp.
     */
    public function actionIndex()
    {
        echo "date:" . date("Y-m-d H:i:s") . "\n\n";
    }

    /**
     * Fetches one list page and imports every article it links to.
     *
     * Every anchor on the page is inspected; hrefs containing
     * ARTICLE_URL_FRAGMENT are echoed and handed to paserDetail(). Each URL is
     * processed at most once per list page, even when the page links to it
     * several times in non-adjacent positions.
     *
     * @param int $id list page number appended to BASE_URL
     * @return bool false when the page could not be fetched/parsed or an
     *              exception was raised, true otherwise
     */
    protected function paserList($id = 1)
    {
        $url = self::BASE_URL . $id;
        $dom = null;

        try {
            // The stream wrapper sends PHP's "user_agent" ini value with the request.
            ini_set('user_agent', self::USER_AGENT);

            $dom = SimpleHTMLDom::file_get_html($url);
            if (!$dom) {
                // file_get_html() signals failure with false; calling find() on it
                // would raise an Error that the catch below does not handle.
                throw new Exception('Failed to fetch or parse ' . $url);
            }

            $seen = [];
            foreach ($dom->find('a') as $anchor) {
                $href = $anchor->href;
                if (!is_string($href) || strpos($href, self::ARTICLE_URL_FRAGMENT) === false) {
                    continue;
                }
                if (isset($seen[$href])) {
                    continue;
                }
                $seen[$href] = true;

                // List every detail page URL, then import it.
                echo $href . "\n";
                $this->paserDetail($href);
            }
        } catch (Exception $e) {
            echo $id . ": error\n";
            echo '捕获异常:' . $e->getMessage() . "\n错误代码:" . $e->getCode() . "\n";
            echo '' . $e->getLine() . "\n";
            return false;
        } finally {
            // Release the parsed tree, as the Simple HTML DOM manual recommends.
            if ($dom) {
                $dom->clear();
            }
        }

        return true;
    }

    /**
     * Fetches one article page and saves it as a BokeeArticle.
     *
     * Title, publication date and body are read from the elements carrying the
     * itemprop attributes "articleSection", "datePublished" and "articleBody".
     * Missing or empty values are replaced by placeholder strings. When the
     * first save() fails, content and plaintext are replaced by the placeholder
     * and the save is attempted once more.
     *
     * @param string $url absolute URL of the article page
     * @return bool false when the page could not be fetched/parsed or an
     *              exception was raised, true otherwise (also when save() failed)
     */
    protected function paserDetail($url)
    {
        $dom = null;

        try {
            ini_set('user_agent', self::USER_AGENT);

            $dom = SimpleHTMLDom::file_get_html($url);
            if (!$dom) {
                throw new Exception('Failed to fetch or parse ' . $url);
            }

            // find($selector, 0) yields null when nothing matches, so guard every read.
            $titleNode = $dom->find('*[itemprop=articleSection]', 0);
            $title = $titleNode ? $titleNode->plaintext : null;
            echo $title . "\n";

            $dateNode = $dom->find('*[itemprop=datePublished]', 0);
            $datePublished = $dateNode ? $dateNode->plaintext : null;
            echo $datePublished . "\n";

            $bodyNode = $dom->find('*[itemprop=articleBody]', 0);
            $articleBody = $bodyNode ? $bodyNode->innertext : null;
            $plaintext = $bodyNode ? $bodyNode->plaintext : null;

            $article = new BokeeArticle();
            $article->url = trim($url);
            $article->title = $title ? trim($title) : '空';
            $article->datetime = $datePublished ? trim($datePublished) : '空';
            $article->content = $articleBody ? trim($articleBody) : '无内容';
            $article->plaintext = $plaintext ? trim($plaintext) : '无内容';

            $result = $article->save();
            if (!$result) {
                // Retry with placeholder bodies so the record (URL, title, date) is still kept.
                $article->plaintext = '无内容';
                $article->content = '无内容';
                $result = $article->save();
            }
            echo $result . "\n";
        } catch (Exception $e) {
            echo '捕获异常:' . $e->getMessage() . "\n错误代码:" . $e->getCode() . "\n";
            echo '' . $e->getLine() . "\n";
            return false;
        } finally {
            if ($dom) {
                $dom->clear();
            }
        }

        return true;
    }
}
以上代码就把我在bokee上的所有日志都导入了数据库中,然后用数据库工具直接处理,格式化后导入到本站的数据库表中,表名是:wp_posts。