0
点赞
收藏
分享

微信扫一扫

JAVA根据公众号链接解析文章内容

何晓杰Dev 2022-02-25 阅读 40
package com.cms.modules.crawling.processor;


import com.alibaba.fastjson.JSONObject;
import com.cms.common.exception.RRException;
import com.cms.modules.crawling.model.MpVideo;
import com.cms.modules.crawling.model.WechatArticle;
import com.cms.modules.crawling.util.SnowflakeIdWorker;
import com.cms.modules.oss.CosClient;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Component
@Slf4j
public class WxArticleCrawlingProcessor {

    /**
     * 文章对象存储路径前缀
     */
    private static final String ARTICLE_OSS_PATH_PREFIX = "article/";

    private static final String WX_CSS_URL = "https://resource.aijiatui.com/bob/www/spider_public/css/weixin.css";

    private static final String EMPTY_HTML = "<!DOCTYPE html>\n" +
            "<html lang=\"zh\">\n" +
            "<head>\n" +
            "    <meta charset=\"UTF-8\">\n" +
            "    <meta content=\"width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0\" name=\"viewport\">\n" +
            "    <link rel=\"stylesheet\" type=\"text/css\" href=\"" + WX_CSS_URL + "\"/>\n" +
            "    <title></title>\n" +
            "</head>\n" +
            "<body></body>\n" +
            "</html>";

    private static final String TITLE_CLASS = "h1.rich_media_title";

    private static final String VOICE_URL = "https://res.wx.qq.com/voice/getvoice?mediaid=";

    /**
     * 解析文章
     * @param url 公众号链接地址
     * @return WechatArticle
     */
    public WechatArticle crawling(String url){
        WechatArticle article = new WechatArticle();
        article.setArticleId(SnowflakeIdWorker.getNextId());
        String articleId = String.valueOf(article.getArticleId());
        if (!checkUrl(url)) {//校验url
            throw new RRException("参数地址错误");
            //throw new RuntimeException("param url error");
        }
        try {
            Document sourceDoc = Jsoup.parse(new URL(url), 60000);
            Document document = Jsoup.parse(EMPTY_HTML); //$n
            document.body().append("<div id=\"docRoot\" class=\"docRoot rich_media_area_primary\"></div>");
            Elements div = document.select("#docRoot");//$div
            //获取标题
            article.setTitle(getArticleTitle(sourceDoc));
            //获取公众号
            article.setSourceName(getSourceName(sourceDoc));
            Element authorHtml = sourceDoc.select("#meta_content").first();
            if (authorHtml == null) return null;
            String author = sourceDoc.select("head meta[name='author']").attr("content");
            String description = sourceDoc.select("head meta[name='description']").attr("content");
            article.setAuthor(author);
            article.setDescription(description);
            article.setCoverImg(getCoverImg(sourceDoc,articleId));
            div.append(authorHtml.html());
            //获取内容
            Element content = sourceDoc.selectFirst(".rich_media_content");
            div.append("<div class=\"rich_media_content \" id=\"js_content\">" + content.html() + "</div>");
            div.select("script").remove();
            document.body().append("<div class=\"rich_media_inner\"></div>");
            document.select(".rich_media_inner").addClass("zh_CN mm_appmsg  appmsg_skin_default appmsg_style_default");
            document.select("section").forEach(e -> {
                String style = e.attr("style");
                if (StringUtils.isNotBlank(style) && style.contains("inline-block") && !style.contains("width") && style.contains("line-height")) {
                    e.attr("style", style.replace("inline-block", "block"));
                }
            });
            // 处理图片节点
            Elements imgElements = document.select("img");
            if (!CollectionUtils.isEmpty(imgElements)) {
                for (Element imgElement : imgElements) {
                    parseImageElement(imgElement,articleId);
                }
            }
            // 处理音频节点
            Elements mpvoiceElements = document.select("mpvoice");
            if (!CollectionUtils.isEmpty(mpvoiceElements)) {
                for (Element voiceElement : mpvoiceElements) {
                    parseVoiceElement(voiceElement,articleId);
                }
            }
            // 处理视频节点
            Elements videoElements = document.select("iframe.video_iframe,iframe.rich_pages");
            if (!CollectionUtils.isEmpty(videoElements)) {
                for (Element videoElement : videoElements) {
                    parseVideoElement(videoElement,articleId);
                }
            }
            // 处理背景图属性
            parseBackgroundImageElement(document,articleId);
            // 处理样式文件
            Elements styles = document.select("[style]");
            if (!CollectionUtils.isEmpty(styles)) {
                StringBuffer sb = new StringBuffer();
                for (Element styleElement : styles) {
                    sb.append(styleElement.attr("style"));
                }
            }
            article.setContent(document.html().replaceAll("\\s{2,}","")/*.replaceAll("\\n","")*/);


            return article;
        } catch (Exception e) {
            log.error("抓取文章错误,url:{},error:{}",url,e.getMessage());
        }
        return null;
    }

    /**
     * 获取文章标题
     *
     * @param sourceDoc
     * @return
     */
    private String getArticleTitle(Document sourceDoc) {
        String title = "";
        if (StringUtils.isNotBlank(title = sourceDoc.select(TITLE_CLASS).text())){
        } else if (StringUtils.isNotBlank(title = sourceDoc.select("head meta[property='twitter:title']").attr("content"))){
        }
        sourceDoc.select("head title").html(title);
        return title;
    }

    /**
     * 获取服务号名称
     *
     * @param sourceDoc
     * @return
     */
    private String getSourceName(Document sourceDoc) {
        String sourceName = "";
        if (StringUtils.isNotBlank(sourceName = sourceDoc.select("strong.profile_nickname").text())){
        }
        return sourceName;
    }

    /**
     * 获取封面图片
     *
     * @param sourceDoc
     * @return
     */
    private String getCoverImg(Document sourceDoc,String articleId) {
        String coverImg = "";
        if (StringUtils.isNotBlank(coverImg = sourceDoc.select("head meta[property='og:image']").attr("content"))){
            coverImg = getOssValue(coverImg, articleId);
        }
        return coverImg;
    }


    /**
     * 图片元素处理
     */
    private void parseImageElement(Element imgElement, String articleId) {

        String imgURL = imgElement.attr("data-src");
        if (StringUtils.isNotBlank(imgURL)) {
            imgElement.attr("data-src", getOssValue(imgURL,articleId));
        }
        // Step2: 处理 src 属性
        String imgURL2 = imgElement.attr("src");
        if (StringUtils.isNotBlank(imgURL2)) {
            if (imgURL2.equals(imgURL)) {
                imgElement.attr("src", imgElement.attr("data-src"));
            } else {
                imgElement.attr("src", getOssValue(imgURL2,articleId));
            }
        } else {
            imgElement.attr("src", imgElement.attr("data-src"));
        }
        String style = imgElement.attr("style");
        if (StringUtils.isNotBlank(style)) {
            imgElement.attr("style","");
        }
    }

    /**
     * 背景图片元素处理
     */
    private void parseBackgroundImageElement(Document sourceDoc, String articleId) {
        for (Element element : sourceDoc.select("*")) {
            if (element.attr("style").length() > 50) { /*background: url("https://mmbiz.qpic.cn/mmbiz_png/hNvibHTw8mRicxmua4icsN5QPpBCGOwFSGdVZUfy5qXHjkvnyAIh1rwfPBZLmz1rLGv3xnmRT9AgwFtO0vmNPTzqQ/640?wx_fmt=png") center center / 100%;padding: 1.5em;box-sizing: border-box;*/
                String style = element.attr("style");
                int begin = style.indexOf("url(\"");
                if (begin <0 ) {
                    continue;
                }
                begin += 5;
                int end = style.indexOf("\")");
                String url = style.substring(begin,end);
                String newUrl = getOssValue(url, articleId);
                style = new StringBuffer(style).replace(begin, end, newUrl).toString();
                element.attr("style", style);
            }
        }
    }

    /**
     * 音频链接处理
     * @param voiceElement
     * @param articleId
     */
    public void parseVoiceElement(Element voiceElement, String articleId) {

        String voiceURL = VOICE_URL + voiceElement.attr("voice_encode_fileid");
        String newURL = getOssValue(voiceURL, articleId);
        if(!StringUtils.isEmpty(newURL)) {
        	voiceElement.attr("voice_encode_fileid", newURL);
        }
    }

    /**
     * 处理腾讯视频节点
     * @param videoElement
     */
    public void parseVideoElement(Element videoElement,String articleId) throws UnsupportedEncodingException {
        String url = videoElement.attr("data-src");
        if (StringUtils.isEmpty(url)) {
            url = videoElement.attr("src");
        }
        String vid = url.substring(url.indexOf("&vid=") + 5);
        if (vid.indexOf("&") > 0) {
            vid = vid.substring(0, vid.indexOf("&") - 1);
        }
        String mpvid = videoElement.attr("data-mpvid");
        if (StringUtils.isNotBlank(mpvid)) {
            // 这种是用户在微信后台直接上传的视频,这种视频放在微信的服务器,不允许跨域,要抓取下来存储
            String videoUrl = getMpvidVideoUrl(mpvid);
            String coverUrl = videoElement.attr("data-cover");
            coverUrl = URLDecoder.decode(coverUrl, "UTF-8");
            videoUrl = getOssValue(videoUrl,articleId);
            coverUrl = getOssValue(coverUrl,articleId);
            videoElement.after("<video poster=\"" + coverUrl + "\" src=\"" + videoUrl + "\" controls=\"controls\" width=\"100%\"></video>");
            videoElement.remove();
        } else if (StringUtils.isNotBlank(vid)) {//腾讯视频,可直接使用
            videoElement.after("<iframe frameborder=\"0\" src=\"https://v.qq.com/txp/iframe/player.html?vid=" + vid + "\" allowFullScreen=\"true\" style=\"width: 100%\"></iframe>");
            videoElement.remove();
        } else {
            videoElement.attr("src",url);
        }
        videoElement.attr("data-src","");
        videoElement.attr("style", videoElement.attr("style") + ";width: 100%;");
    }

    /**
     * 替换style属性中background-img的外部资源引用
     * @param style
     * @return
     */
    public String parseBackgroundImageURL(String style, String articleId) {
        if(StringUtils.isEmpty(style) || style.indexOf("background-image: url(") == -1) {
            return style;
        }
        style = style.replaceAll("&quot;", "\"");
        String regex = "background-image: url\\(\"(.*?)\"\\)";
        Matcher m = Pattern.compile(regex).matcher(style);
        StringBuffer sb = new StringBuffer();
        while(m.find()) {
            String url = m.group(1);
            if (url.startsWith("//res.wx.qq.com")) {
                url = "https:" + url;
            }
            String newURL = getOssValue(url,articleId);
            if (!StringUtils.isEmpty(newURL)) {
                String newValue = String.format("background-image: url(\"%s\")", newURL);
                m.appendReplacement(sb, newValue);
            } else {
                log.info(">>>>>>>>爬取公众号文章背景图片转化失败, url={}", url);
            }
        }
        m.appendTail(sb);
        return sb.toString();
    }

    private String getMpvidVideoUrl(String mpvid) {
        StringBuffer reqUrl = new StringBuffer("https://mp.weixin.qq.com/mp/videoplayer");
        reqUrl.append("?action=get_mp_video_play_url").append("&vid=").append(mpvid)
                .append("&uin=&key=&pass_ticket=&wxtoken=777&appmsg_token=&x5=0&f=json");
        // 获得Http客户端
        CloseableHttpClient httpClient = HttpClientBuilder.create().build();
        HttpGet httpGet = new HttpGet(reqUrl.toString());
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            HttpEntity responseEntity = response.getEntity();
            if (responseEntity != null) {
                String resultStr = EntityUtils.toString(responseEntity);
                MpVideo mpVideo = JSONObject.parseObject(resultStr, MpVideo.class);
                for (MpVideo.UrlInfoBean urlInfo : mpVideo.getUrl_info()) {
                    if (urlInfo.getFilesize() < 1024 * 1024 * 20) {//从超清》高清》流畅遍历,选择<20M的格式,如果都>20M,选流畅格式
                        return urlInfo.getUrl();
                    }
                    if (Objects.equals(10004,urlInfo.getFormat_id())) {
                        return urlInfo.getUrl();
                    }
                }
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                // 释放资源
                if (httpClient != null) {
                    httpClient.close();
                }
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
        /**
         * url_info 中是各个视频格式,选任意一个都可以下载
         * {
         "base_resp": {
         "ret": 0
         },
         "url_info": [
         {
         "url": "http://mpvideo.qpic.cn/tjg_3596690706_50000_c5c62b33b0f84ed080d61c6248f99f53.f10002.mp4?dis_k=27ae49543dc8adefce2dd99af1dec975&dis_t=1561114058",
         "format_id": 10002,
         "duration_ms": 83668,
         "filesize": 4107669,
         "width": 548,
         "height": 366
         },
         {
         "url": "http://mpvideo.qpic.cn/tjg_3596690706_50000_c5c62b33b0f84ed080d61c6248f99f53.f10003.mp4?dis_k=68f0034f55f73b75c31e1528d66dda32&dis_t=1561114058",
         "format_id": 10003,
         "duration_ms": 83668,
         "filesize": 3079454,
         "width": 548,
         "height": 366
         },
         {
         "url": "http://mpvideo.qpic.cn/tjg_3596690706_50000_c5c62b33b0f84ed080d61c6248f99f53.f10004.mp4?dis_k=635e8eec66b796370885ebb0e5ed86eb&dis_t=1561114058",
         "format_id": 10004,
         "duration_ms": 83668,
         "filesize": 3072936,
         "width": 402,
         "height": 270
         }
         ],
         }
         * */
    }


    boolean checkUrl(String url) {
        if (StringUtils.isBlank(url)) {
            return false;
        }
        url.replaceAll(" ","");
        return url.startsWith("https://mp.weixin.qq.com/s") || url.startsWith("http://mp.weixin.qq.com/s");
    }

    private String getOssValue(String strUrl, String articleId){
        try {
            HttpURLConnection conn = null;
            URL url = new URL(strUrl);
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(10000);
            final ByteArrayOutputStream output = new ByteArrayOutputStream();
            //IOUtils.copy(conn.getInputStream(), output);
           // String key = ARTICLE_OSS_PATH_PREFIX + articleId + ":" + UUID.randomUUID();
            // todo: oss 上传
            Map<String,String>  map = CosClient.uploadFile(conn.getInputStream());

//            jiaTuiOssApi.uploadFile(output, key);
            return map.get("url");
        } catch (MalformedURLException e) {
            log.error(">>>>>>>>>上传图片失败报错:{}",e);
            return "";
        } catch (IOException e) {
            log.error(">>>>>>>>>上传图片失败报错:{}",e);
            return "";
        }
    }

}

举报

相关推荐

0 条评论