package com.cms.modules.crawling.processor;
import com.alibaba.fastjson.JSONObject;
import com.cms.common.exception.RRException;
import com.cms.modules.crawling.model.MpVideo;
import com.cms.modules.crawling.model.WechatArticle;
import com.cms.modules.crawling.util.SnowflakeIdWorker;
import com.cms.modules.oss.CosClient;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Component
@Slf4j
public class WxArticleCrawlingProcessor {
private static final String ARTICLE_OSS_PATH_PREFIX = "article/";
private static final String WX_CSS_URL = "https://resource.aijiatui.com/bob/www/spider_public/css/weixin.css";
private static final String EMPTY_HTML = "<!DOCTYPE html>\n" +
"<html lang=\"zh\">\n" +
"<head>\n" +
" <meta charset=\"UTF-8\">\n" +
" <meta content=\"width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0\" name=\"viewport\">\n" +
" <link rel=\"stylesheet\" type=\"text/css\" href=\"" + WX_CSS_URL + "\"/>\n" +
" <title></title>\n" +
"</head>\n" +
"<body></body>\n" +
"</html>";
private static final String TITLE_CLASS = "h1.rich_media_title";
private static final String VOICE_URL = "https://res.wx.qq.com/voice/getvoice?mediaid=";
public WechatArticle crawling(String url){
WechatArticle article = new WechatArticle();
article.setArticleId(SnowflakeIdWorker.getNextId());
String articleId = String.valueOf(article.getArticleId());
if (!checkUrl(url)) {
throw new RRException("参数地址错误");
}
try {
Document sourceDoc = Jsoup.parse(new URL(url), 60000);
Document document = Jsoup.parse(EMPTY_HTML);
document.body().append("<div id=\"docRoot\" class=\"docRoot rich_media_area_primary\"></div>");
Elements div = document.select("#docRoot");
article.setTitle(getArticleTitle(sourceDoc));
article.setSourceName(getSourceName(sourceDoc));
Element authorHtml = sourceDoc.select("#meta_content").first();
if (authorHtml == null) return null;
String author = sourceDoc.select("head meta[name='author']").attr("content");
String description = sourceDoc.select("head meta[name='description']").attr("content");
article.setAuthor(author);
article.setDescription(description);
article.setCoverImg(getCoverImg(sourceDoc,articleId));
div.append(authorHtml.html());
Element content = sourceDoc.selectFirst(".rich_media_content");
div.append("<div class=\"rich_media_content \" id=\"js_content\">" + content.html() + "</div>");
div.select("script").remove();
document.body().append("<div class=\"rich_media_inner\"></div>");
document.select(".rich_media_inner").addClass("zh_CN mm_appmsg appmsg_skin_default appmsg_style_default");
document.select("section").forEach(e -> {
String style = e.attr("style");
if (StringUtils.isNotBlank(style) && style.contains("inline-block") && !style.contains("width") && style.contains("line-height")) {
e.attr("style", style.replace("inline-block", "block"));
}
});
Elements imgElements = document.select("img");
if (!CollectionUtils.isEmpty(imgElements)) {
for (Element imgElement : imgElements) {
parseImageElement(imgElement,articleId);
}
}
Elements mpvoiceElements = document.select("mpvoice");
if (!CollectionUtils.isEmpty(mpvoiceElements)) {
for (Element voiceElement : mpvoiceElements) {
parseVoiceElement(voiceElement,articleId);
}
}
Elements videoElements = document.select("iframe.video_iframe,iframe.rich_pages");
if (!CollectionUtils.isEmpty(videoElements)) {
for (Element videoElement : videoElements) {
parseVideoElement(videoElement,articleId);
}
}
parseBackgroundImageElement(document,articleId);
Elements styles = document.select("[style]");
if (!CollectionUtils.isEmpty(styles)) {
StringBuffer sb = new StringBuffer();
for (Element styleElement : styles) {
sb.append(styleElement.attr("style"));
}
}
article.setContent(document.html().replaceAll("\\s{2,}",""));
return article;
} catch (Exception e) {
log.error("抓取文章错误,url:{},error:{}",url,e.getMessage());
}
return null;
}
private String getArticleTitle(Document sourceDoc) {
String title = "";
if (StringUtils.isNotBlank(title = sourceDoc.select(TITLE_CLASS).text())){
} else if (StringUtils.isNotBlank(title = sourceDoc.select("head meta[property='twitter:title']").attr("content"))){
}
sourceDoc.select("head title").html(title);
return title;
}
private String getSourceName(Document sourceDoc) {
String sourceName = "";
if (StringUtils.isNotBlank(sourceName = sourceDoc.select("strong.profile_nickname").text())){
}
return sourceName;
}
private String getCoverImg(Document sourceDoc,String articleId) {
String coverImg = "";
if (StringUtils.isNotBlank(coverImg = sourceDoc.select("head meta[property='og:image']").attr("content"))){
coverImg = getOssValue(coverImg, articleId);
}
return coverImg;
}
private void parseImageElement(Element imgElement, String articleId) {
String imgURL = imgElement.attr("data-src");
if (StringUtils.isNotBlank(imgURL)) {
imgElement.attr("data-src", getOssValue(imgURL,articleId));
}
String imgURL2 = imgElement.attr("src");
if (StringUtils.isNotBlank(imgURL2)) {
if (imgURL2.equals(imgURL)) {
imgElement.attr("src", imgElement.attr("data-src"));
} else {
imgElement.attr("src", getOssValue(imgURL2,articleId));
}
} else {
imgElement.attr("src", imgElement.attr("data-src"));
}
String style = imgElement.attr("style");
if (StringUtils.isNotBlank(style)) {
imgElement.attr("style","");
}
}
private void parseBackgroundImageElement(Document sourceDoc, String articleId) {
for (Element element : sourceDoc.select("*")) {
if (element.attr("style").length() > 50) {
String style = element.attr("style");
int begin = style.indexOf("url(\"");
if (begin <0 ) {
continue;
}
begin += 5;
int end = style.indexOf("\")");
String url = style.substring(begin,end);
String newUrl = getOssValue(url, articleId);
style = new StringBuffer(style).replace(begin, end, newUrl).toString();
element.attr("style", style);
}
}
}
public void parseVoiceElement(Element voiceElement, String articleId) {
String voiceURL = VOICE_URL + voiceElement.attr("voice_encode_fileid");
String newURL = getOssValue(voiceURL, articleId);
if(!StringUtils.isEmpty(newURL)) {
voiceElement.attr("voice_encode_fileid", newURL);
}
}
public void parseVideoElement(Element videoElement,String articleId) throws UnsupportedEncodingException {
String url = videoElement.attr("data-src");
if (StringUtils.isEmpty(url)) {
url = videoElement.attr("src");
}
String vid = url.substring(url.indexOf("&vid=") + 5);
if (vid.indexOf("&") > 0) {
vid = vid.substring(0, vid.indexOf("&") - 1);
}
String mpvid = videoElement.attr("data-mpvid");
if (StringUtils.isNotBlank(mpvid)) {
String videoUrl = getMpvidVideoUrl(mpvid);
String coverUrl = videoElement.attr("data-cover");
coverUrl = URLDecoder.decode(coverUrl, "UTF-8");
videoUrl = getOssValue(videoUrl,articleId);
coverUrl = getOssValue(coverUrl,articleId);
videoElement.after("<video poster=\"" + coverUrl + "\" src=\"" + videoUrl + "\" controls=\"controls\" width=\"100%\"></video>");
videoElement.remove();
} else if (StringUtils.isNotBlank(vid)) {
videoElement.after("<iframe frameborder=\"0\" src=\"https://v.qq.com/txp/iframe/player.html?vid=" + vid + "\" allowFullScreen=\"true\" style=\"width: 100%\"></iframe>");
videoElement.remove();
} else {
videoElement.attr("src",url);
}
videoElement.attr("data-src","");
videoElement.attr("style", videoElement.attr("style") + ";width: 100%;");
}
public String parseBackgroundImageURL(String style, String articleId) {
if(StringUtils.isEmpty(style) || style.indexOf("background-image: url(") == -1) {
return style;
}
style = style.replaceAll(""", "\"");
String regex = "background-image: url\\(\"(.*?)\"\\)";
Matcher m = Pattern.compile(regex).matcher(style);
StringBuffer sb = new StringBuffer();
while(m.find()) {
String url = m.group(1);
if (url.startsWith("//res.wx.qq.com")) {
url = "https:" + url;
}
String newURL = getOssValue(url,articleId);
if (!StringUtils.isEmpty(newURL)) {
String newValue = String.format("background-image: url(\"%s\")", newURL);
m.appendReplacement(sb, newValue);
} else {
log.info(">>>>>>>>爬取公众号文章背景图片转化失败, url={}", url);
}
}
m.appendTail(sb);
return sb.toString();
}
private String getMpvidVideoUrl(String mpvid) {
StringBuffer reqUrl = new StringBuffer("https://mp.weixin.qq.com/mp/videoplayer");
reqUrl.append("?action=get_mp_video_play_url").append("&vid=").append(mpvid)
.append("&uin=&key=&pass_ticket=&wxtoken=777&appmsg_token=&x5=0&f=json");
CloseableHttpClient httpClient = HttpClientBuilder.create().build();
HttpGet httpGet = new HttpGet(reqUrl.toString());
CloseableHttpResponse response = null;
try {
response = httpClient.execute(httpGet);
HttpEntity responseEntity = response.getEntity();
if (responseEntity != null) {
String resultStr = EntityUtils.toString(responseEntity);
MpVideo mpVideo = JSONObject.parseObject(resultStr, MpVideo.class);
for (MpVideo.UrlInfoBean urlInfo : mpVideo.getUrl_info()) {
if (urlInfo.getFilesize() < 1024 * 1024 * 20) {
return urlInfo.getUrl();
}
if (Objects.equals(10004,urlInfo.getFormat_id())) {
return urlInfo.getUrl();
}
}
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (ParseException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (httpClient != null) {
httpClient.close();
}
if (response != null) {
response.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
boolean checkUrl(String url) {
if (StringUtils.isBlank(url)) {
return false;
}
url.replaceAll(" ","");
return url.startsWith("https://mp.weixin.qq.com/s") || url.startsWith("http://mp.weixin.qq.com/s");
}
private String getOssValue(String strUrl, String articleId){
try {
HttpURLConnection conn = null;
URL url = new URL(strUrl);
conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
conn.setConnectTimeout(10000);
final ByteArrayOutputStream output = new ByteArrayOutputStream();
Map<String,String> map = CosClient.uploadFile(conn.getInputStream());
return map.get("url");
} catch (MalformedURLException e) {
log.error(">>>>>>>>>上传图片失败报错:{}",e);
return "";
} catch (IOException e) {
log.error(">>>>>>>>>上传图片失败报错:{}",e);
return "";
}
}
}