用Yii2实现网站爬虫将bokee blog的旧博批量导入本站

本人在国内最早的博客网站bokee.com有过一段时间的日志记录,现在看来虽然没有什么特别的用处,也是一些回忆和纪念,因此决定批量导入到本站。

PHP Simple HTML DOM Parser

采用的是Yii2框架结合“PHP Simple HTML DOM Parser”,其地址为:http://sourceforge.net/projects/simplehtmldom/,我用的是它的yii2插件版本keltstr\simplehtmldom。但是由于年久失修,已经不兼容php7+语法,因此,我自己做了一个升级版本,放在我的github上,github地址:https://github.com/deaboway/yii2-simplehtmldom

wp升级之后,code加亮的插件不兼容,现在已经弃用了,虽然该插件带来很大方便,但是也让加载速度变慢,直接贴的代码又不是没法看,而且新版本的wp支持代码格式也比较好了。现在就把爬bokee.com上自己小站的代码奉上。我在bokee上的二级域名是:http://deaboway.bokee.com/

<?php

namespace console\controllers;

use Yii;
use yii\console\Controller;
use keltstr\simplehtmldom\SimpleHTMLDom;
use backend\models\BokeeArticle;
use Exception;


/**
 * Console crawler that imports posts from http://deaboway.bokee.com/ into
 * BokeeArticle records.
 *
 * Each list page is scanned for article links; every article page is then
 * parsed via its itemprop attributes (articleSection, datePublished,
 * articleBody) and saved as one BokeeArticle.
 *
 * Selector reference: https://simplehtmldom.sourceforge.io/manual.htm
 */
class BokeeController extends Controller
{
    /**
     * User-Agent applied through ini_set('user_agent') before every fetch,
     * so requests carry a browser-like User-Agent header.
     */
    const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36';

    /** Root of the blog; list pages are BASE_URL followed by the page number. */
    const BASE_URL = 'http://deaboway.bokee.com/';

    /**
     * Crawls a range of list pages and imports every article found on them.
     *
     * @param int $from first list page to crawl (inclusive)
     * @param int $to   last list page to crawl (inclusive); the defaults
     *                  reproduce the original hard-coded range 1..14
     */
    public function actionDo($from = 1, $to = 14)
    {
        // Timestamp marking the start of the run.
        echo "date:".date("Y-m-d H:i:s")."\n\n";

        // Console arguments arrive as strings, so normalise them once.
        $first = (int)$from;
        $last = (int)$to;
        for ($id = $first; $id <= $last; $id++) {
            $this->paserList($id);
        }
    }

    /**
     * Default action: prints the current timestamp only.
     */
    public function actionIndex()
    {
        echo "date:".date("Y-m-d H:i:s")."\n\n";
    }

    /**
     * Parses one list page and imports every article it links to.
     *
     * A link counts as an article when its href contains
     * "http://deaboway.bokee.com/1". Each distinct href is processed once per
     * page, even when the page repeats it in non-adjacent anchors.
     *
     * @param int $id list page number, appended to BASE_URL
     * @return bool true when the page was processed, false when loading or
     *              parsing raised an exception
     */
    protected function paserList($id=1) {
        $url = self::BASE_URL.$id;
        $html_source = null;

        try {
            $html_source = $this->fetchHtml($url);

            $articlePrefix = self::BASE_URL.'1';
            $seen = [];
            foreach ($html_source->find('a') as $element) {
                $href = $element->href;
                // Skip anchors that carry no usable href value.
                if (!is_string($href) || strpos($href, $articlePrefix) === false) {
                    continue;
                }
                if (isset($seen[$href])) {
                    continue;
                }
                $seen[$href] = true;

                // List every detail URL as it is imported.
                echo $href . "\n";
                $this->paserDetail($href);
            }
        } catch(Exception $e) {
            echo $id .": error\n";
            echo '捕获异常:'.$e->getMessage()."\n错误代码:".$e->getCode(). "\n";
            echo ''.$e->getLine()."\n";
            return false;
        } finally {
            // Release the parsed DOM tree; many pages are parsed in one run.
            if (is_object($html_source)) {
                $html_source->clear();
            }
        }

        return true;
    }

    /**
     * Parses one article page and stores it as a BokeeArticle.
     *
     * Missing or empty fields fall back to placeholders ('空' for title and
     * datetime, '无内容' for content and plaintext). If the first save fails,
     * the body fields are replaced by the placeholder and the save is retried
     * once.
     *
     * @param string $url absolute URL of the article page
     * @return bool true when the article was saved, false when saving failed
     *              or an exception was raised
     */
    protected function paserDetail($url) {
        $html_source = null;

        try {
            $html_source = $this->fetchHtml($url);

            $title = $this->extractField($html_source, '*[itemprop=articleSection]', 'plaintext');
            echo $title . "\n";
            $datePublished = $this->extractField($html_source, '*[itemprop=datePublished]', 'plaintext');
            echo $datePublished . "\n";
            $articleBody = $this->extractField($html_source, '*[itemprop=articleBody]', 'innertext');
            $plaintext = $this->extractField($html_source, '*[itemprop=articleBody]', 'plaintext');

            $article = new BokeeArticle();
            $article->url = trim($url);
            $article->title = $this->trimmedOr($title, '空');
            $article->datetime = $this->trimmedOr($datePublished, '空');
            $article->content = $this->trimmedOr($articleBody, '无内容');
            $article->plaintext = $this->trimmedOr($plaintext, '无内容');

            $result = $article->save();
            if (!$result) {
                // Retry once with placeholder bodies so that at least the
                // url, title and date of the post are recorded.
                $article->plaintext = '无内容';
                $article->content = '无内容';
                $result = $article->save();
            }
            // Print an explicit 1/0 so a failed save is visible in the log.
            echo ($result ? '1' : '0') ."\n";

            return (bool)$result;
        } catch(Exception $e) {
            echo '捕获异常:'.$e->getMessage()."\n错误代码:".$e->getCode(). "\n";
            echo ''.$e->getLine()."\n";
            return false;
        } finally {
            // Release the parsed DOM tree; many pages are parsed in one run.
            if (is_object($html_source)) {
                $html_source->clear();
            }
        }
    }

    /**
     * Downloads and parses a page.
     *
     * A failed load is turned into an Exception so that the callers'
     * catch (Exception) blocks handle it, rather than a later method call on
     * a non-object aborting the whole crawl.
     *
     * @param string $url page to download
     * @return object parsed DOM as returned by SimpleHTMLDom::file_get_html()
     * @throws Exception when the page could not be loaded or parsed
     */
    private function fetchHtml($url)
    {
        ini_set('user_agent', self::USER_AGENT);
        $dom = SimpleHTMLDom::file_get_html($url);
        if (!is_object($dom)) {
            throw new Exception('failed to load page: '.$url);
        }

        return $dom;
    }

    /**
     * Reads one property of the first element matching a selector.
     *
     * @param object $dom      parsed DOM returned by fetchHtml()
     * @param string $selector simplehtmldom selector
     * @param string $property element property to read ('plaintext' or 'innertext')
     * @return string the property value, or '' when no element matches
     */
    private function extractField($dom, $selector, $property)
    {
        $node = $dom->find($selector, 0);
        if (!is_object($node)) {
            return '';
        }

        return (string)$node->$property;
    }

    /**
     * Trims a value and substitutes a fallback when nothing is left.
     *
     * Compares against '' explicitly so that a legitimate '0' is kept and
     * whitespace-only text is treated as missing.
     *
     * @param string $value    raw text
     * @param string $fallback placeholder used when the trimmed text is empty
     * @return string
     */
    private function trimmedOr($value, $fallback)
    {
        $trimmed = trim((string)$value);

        return $trimmed !== '' ? $trimmed : $fallback;
    }
}

以上代码就把我在bokee所有日志都导入了数据库中,然后用数据库工具直接处理,格式化后导入到本站的数据库表中,表名是:wp_posts

欢迎访问:https://www.deaboway.com

欢迎关注我的微信公众号:

 

如无特殊说明,文章均为本站原创,转载请注明出处!