nutch开发(五)-CFANZ编程社区

nutch开发(五)

开发环境

Linux，Ubuntu 20.04 LTS
IDEA
Nutch1.18
Solr8.11

1.开发插件准备

在源码目录src/plugin目录下创建插件工程目录
编写build.xml，ivy.xml，plugin.xml工程配置文件
编写插件类BlogParser，实现HtmlParseFilter扩展点

2.编写build.xml

因为我会使用到一些parse-html插件中的工具，所以我导入parse-html插件依赖

<?xml version="1.0"?>
<!-- Ant build file for the parse-blog plugin; the default target is "jar-core". -->
<project name="parse-blog" default="jar-core">

  <!-- Shared plugin build definitions, imported from the parent plugin directory. -->
  <import file="../build-plugin.xml"/>

  <!-- Build compilation dependencies: invokes the "jar" target of lib-nekohtml
       and the "deploy" target of parse-html, each without inheriting this
       project's properties (inheritall="false"). -->
  <target name="deps-jar">
    <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
    <ant target="deploy" inheritall="false" dir="../parse-html"/>
  </target>

  <!-- Add compilation dependencies to classpath: every jar found in a
       lib-nekohtml or parse-html directory below ${nutch.root}/build. -->
  <path id="plugin.deps">
    <fileset dir="${nutch.root}/build">
      <include name="**/lib-nekohtml/*.jar" />
      <include name="**/parse-html/*.jar" />
    </fileset>
  </path>

  <!-- Deploy unit test dependencies: invokes the "deploy" target of
       lib-nekohtml, nutch-extensionpoints and parse-html. -->
  <target name="deps-test">
    <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
    <ant target="deploy" inheritall="false" dir="../parse-html"/>
  </target>

</project>

createDocumentFragment()用法总结_逍竹的博客-CSDN博客_documentfragment

3.编写ivy.xml

tagsoup看情况进行导入，一般不会用到tagsoup里面的函数对DOM树进行操作。

<?xml version="1.0" ?>
<!-- Ivy module descriptor for the plugin; the module name is taken from the
     Ant project name via ${ant.project.name}. -->
<ivy-module version="1.0">
    <info organisation="org.apache.nutch" module="${ant.project.name}">
        <license name="Apache 2.0"/>
        <ivyauthor name="Wenyao" url="https://nutch.apache.org/"/>
        <description>
            Wenyao
        </description>
    </info>

    <!-- Reuse the configurations declared in the shared ivy-configurations.xml. -->
    <configurations>
        <include file="../../../ivy/ivy-configurations.xml"/>
    </configurations>

    <publications>
        <!-- Get the artifact from our module name. -->
        <artifact conf="master"/>
    </publications>
    
    <!-- Third-party libraries resolved for this plugin: TagSoup 1.2.1. -->
    <dependencies>
        <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
    </dependencies>
</ivy-module>

4.编写plugin.xml

编写插件描述文件，简简单单实现一个HtmlParseFilter扩展点

<?xml version="1.0" encoding="UTF-8"?>
<!-- Plugin descriptor for parse-blog: declares the plugin's libraries, the
     plugins it imports, and the extension it contributes. -->
<plugin
        id="parse-blog"
        name="Blog Parse Plug-in"
        version="1.0.0"
        provider-name="nutch.org">

    <!-- Libraries listed for this plugin: its own jar (with an export
         pattern of "*") and the TagSoup 1.2.1 jar. -->
    <runtime>
        <library name="parse-blog.jar">
            <export name="*"/>
        </library>
        <library name="tagsoup-1.2.1.jar"/>
    </runtime>

    <!-- Plugins imported by this plugin. -->
    <requires>
        <import plugin="nutch-extensionpoints"/>
        <import plugin="lib-nekohtml"/>
        <import plugin="parse-html"/>
    </requires>

    <!-- Contributes BlogParser to the HtmlParseFilter extension point. -->
    <extension id="org.apache.nutch.parse.blog"
               name="BlogParser"
               point="org.apache.nutch.parse.HtmlParseFilter">

        <!-- Fully qualified name of the implementing class. -->
        <implementation id="BlogParser"
                        class="org.apache.nutch.parse.blog.BlogParser">
        </implementation>

    </extension>

</plugin>

5.实现HtmlParseFilter扩展点

先实现一个基本框架，把HtmlParseFilter扩展点的接口方法实现出来。

/**
 * Skeleton implementation of the {@link HtmlParseFilter} extension point.
 *
 * <p>At this stage the filter does no work of its own: it hands the incoming
 * {@link ParseResult} straight back to the caller.
 */
public class BlogParser implements HtmlParseFilter {

    private static final Logger LOG = LoggerFactory
            .getLogger(MethodHandles.lookup().lookupClass());

    /** Configuration most recently supplied through {@link #setConf(Configuration)}. */
    private Configuration conf;

    /** Hand-written helper class; rebuilt whenever the configuration is set. */
    private BlogDataUtils utils;

    /**
     * Returns the configuration stored by {@link #setConf(Configuration)}.
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration and creates the helper from it.
     *
     * @param configuration the configuration to keep
     */
    @Override
    public void setConf(Configuration configuration) {
        conf = configuration;
        utils = new BlogDataUtils(getConf());
    }

    /**
     * Placeholder filter step that leaves the parse result untouched.
     *
     * @return the same {@code parseResult} instance that was passed in
     */
    @Override
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
        return parseResult;
    }
}

6.代码实现

步骤一获取博客发布时间

现在开始实现代码，首先，我们创建一个工具类，提供一些数据解析方法，该类叫做BlogDataUtils.java，该类是对parse-html中的DOMContentUtils.java的封装。

/**
 * Helper class for the blog parser that wraps the {@code DOMContentUtils}
 * tool from the parse-html plugin.
 */
public class BlogDataUtils {

    private static final Logger LOG = LoggerFactory
            .getLogger(MethodHandles.lookup().lookupClass());

    /** Tool from the parse-html plugin that this class delegates to. */
    private DOMContentUtils utils;

    /**
     * Builds the helper and its wrapped {@code DOMContentUtils}.
     *
     * @param configuration configuration handed on to {@code DOMContentUtils}
     */
    public BlogDataUtils(Configuration configuration) {
        utils = new DOMContentUtils(configuration);
    }
}

实现一个获取文章时间的函数，其实通过正则匹配到的第一个结构化时间基本就是博客发布时间，除非你爬取的博客在meta标签中有定义一些结构化数据如：pulishedTime，否则我们一般都是用第一次文本正则匹配到的时间当作博客发布时间。

/**
 * Matches a {@code yyyy-MM-dd} date whose day is valid for its month,
 * including February 29 for the leap years the expression recognises.
 *
 * <p>Compiled once instead of on every call. The expression no longer ends
 * with {@code $}: that anchor applied only to the leap-day alternative, so a
 * date such as {@code 2020-02-29} was found only at the very end of the text.
 */
private static final Pattern BLOG_DATE_PATTERN = Pattern.compile(
        "(([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|"+
        "((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8]))))|((([0-9]{2})(0[48]|[2468][048]|[13579][26])|"+
        "((0[48]|[2468][048]|[3579][26])00))-02-29)");

/**
 * Returns the first {@code yyyy-MM-dd} date found in the given text.
 *
 * @param text plain text to search; may be {@code null}
 * @return the first matching date, or {@code null} when {@code text} is
 *         {@code null} or contains no such date
 */
String catchBlogTime(String text){
    // A missing text has no date; report it the same way as "no match".
    if (text == null) {
        return null;
    }
    Matcher m = BLOG_DATE_PATTERN.matcher(text);
    return m.find() ? m.group() : null;
}

在BlogParser中使用该函数来解析博客时间。

    /**
     * Extracts the blog publication date from the parsed text and stores it
     * in the parse metadata under the key {@code "pulishedTime"}.
     *
     * <p>The parse result is returned unchanged when there is no parse entry
     * for the content URL, when the entry has no text, or when no date is
     * found in the text.
     *
     * @param content     the fetched content; its URL selects the parse entry
     * @param parseResult the parse result produced so far
     * @param metaTags    not used by this filter
     * @param doc         not used by this filter
     * @return the same {@code parseResult} instance
     */
    @Override
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
        // URL of the page currently being processed.
        String url = content.getUrl();
        // Nothing to enrich when no parse is registered for this URL.
        if (parseResult.get(url) == null) {
            return parseResult;
        }
        // Plain text of the page as stored in the parse entry.
        String text = parseResult.get(url).getText();
        String time = (text == null) ? null : utils.catchBlogTime(text);
        // Only record a date that was actually found; never store a null value.
        if (time != null) {
            Metadata parseMeta = parseResult.get(url).getData().getParseMeta();
            parseMeta.add("pulishedTime", time);
        }

        return parseResult;
    }

步骤二获取网站的ico

/**
 * Records the site icon URL when the given node is a
 * {@code <link rel="shortcut icon" href="...">} element.
 *
 * <p>The {@code href} value is stored as a {@link StringBuffer} under the key
 * {@code "metatag.icon"}. The {@code rel} value is compared ignoring case,
 * because HTML link types are case-insensitive. A link without an
 * {@code href} attribute is skipped instead of raising a
 * {@link NullPointerException}.
 *
 * @param currentNode node to inspect
 * @param map         map that receives the icon entry
 * @return the same {@code map} instance, with or without the new entry
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public Map catchIco(Node currentNode, Map map){
    // The icon is declared on a <link> element.
    if (!"link".equalsIgnoreCase(currentNode.getNodeName())) {
        return map;
    }
    NamedNodeMap attributes = currentNode.getAttributes();
    if (attributes == null) {
        return map;
    }
    // In the W3C DOM an attribute is itself a node.
    Node rel = attributes.getNamedItem("rel");
    Node href = attributes.getNamedItem("href");
    if (rel == null || href == null) {
        return map;
    }
    if ("shortcut icon".equalsIgnoreCase(rel.getNodeValue())) {
        map.put("metatag.icon", new StringBuffer(href.getNodeValue()));
    }
    return map;
}

步骤三获取博客中某个node中的数据

我们根据网站的域名识别是属于哪个网站下的博客，再做数据定位爬取。在BlogDataUtils.java添加以下函数

/**
 * Extracts the host part of a URL string.
 *
 * @param url URL text to parse
 * @return the host reported by {@link URL#getHost()}, or {@code null} when
 *         the text cannot be parsed as a URL (the stack trace is printed)
 */
public String getHost(String url){
    try {
        return new URL(url).getHost();
    } catch (MalformedURLException e) {
        e.printStackTrace();
        return null;
    }
}

在BlogDataUtils.java再添加以下函数，该函数用于获取指定web网站的节点数据。

/**
 * Walks every node below {@code node} and collects site-specific data.
 *
 * <p>Only nodes that have attributes are inspected. Which extractor runs is
 * chosen by {@code host}: one for {@code blog.csdn.net}, one for
 * {@code www.cnblogs.com}, and the site-icon extractor for everything else.
 * A {@code null} host is treated like an unknown site, because switching on
 * a {@code null} string throws a {@link NullPointerException}.
 *
 * @param node root node of the tree to walk
 * @param host host name of the page; may be {@code null}
 * @return a new map holding whatever the extractors stored
 */
@SuppressWarnings("rawtypes")
public HashMap getWebData(Node node,String host){
    // Holds the collected data.
    HashMap map = new HashMap();
    // getHost returns null for a malformed URL; route that to the default branch.
    String site = (host == null) ? "" : host;
    // Node iterator, a utility class from the Nutch sources.
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
        Node currentNode = walker.nextNode();
        // Nodes without attributes carry nothing the extractors look at.
        if (!currentNode.hasAttributes()) {
            continue;
        }
        // NOTE(review): these calls go through `utils`; confirm that the field
        // in the enclosing class is of the type that declares these methods.
        switch (site) {
            case "blog.csdn.net":
                utils.catchCsdnData(currentNode, map);    // CSDN pages
                break;
            case "www.cnblogs.com":
                utils.catchCnblogsData(currentNode, map); // cnblogs pages
                break;
            default:
                utils.catchIco(currentNode, map);         // any other site: icon only
        }
    }
    return map;
}

在BlogDataUtils.java再添加以下函数，该函数用于获取CSDN下博客的点赞数和收藏数。

/**
 * Captures the like count and the collection count of a CSDN blog post.
 *
 * <p>Two nodes are recognised:
 * <ul>
 *   <li>{@code class="count "} (with the trailing space) together with
 *       {@code id="spanCount"}: its text is stored under
 *       {@code "metatag.good"};</li>
 *   <li>{@code id="get-collection"}: its text is stored under
 *       {@code "metatag.collections"}. This check depends on the {@code id}
 *       only, so it also applies when the node has no {@code class}
 *       attribute.</li>
 * </ul>
 *
 * @param currentNode node to inspect
 * @param map         map that receives the captured text as a {@link StringBuffer}
 * @return {@code true} when the node matched and an entry was stored,
 *         {@code false} otherwise
 */
public boolean catchCsdnScore(Node currentNode, Map map){
    NamedNodeMap attributes = currentNode.getAttributes();
    // Only nodes that carry an id can match either rule.
    if (attributes == null) {
        return false;
    }
    Node idNode = attributes.getNamedItem("id");
    if (idNode == null) {
        return false;
    }
    String id = idNode.getNodeValue();
    Node classNode = attributes.getNamedItem("class");

    // Like counter: class="count " (note the trailing space) and id="spanCount".
    if (classNode != null
            && "count ".equals(classNode.getNodeValue())
            && "spanCount".equals(id)) {
        putNodeText(currentNode, map, "metatag.good");
        return true;
    }
    // Collection counter: identified by its id alone.
    if ("get-collection".equals(id)) {
        putNodeText(currentNode, map, "metatag.collections");
        return true;
    }
    return false;
}

/** Stores the text collected from {@code node} in {@code map} under {@code key}. */
@SuppressWarnings({"rawtypes", "unchecked"})
private void putNodeText(Node node, Map map, String key) {
    StringBuffer sb = new StringBuffer();
    utils.getText(sb, node);
    map.put(key, sb);
}