Java工具类:
去除HTML标签的Java代码如下:
/**
 * Static helpers that strip or extract HTML fragments with regular expressions.
 *
 * <p>These are lightweight text filters, not an HTML parser: they work on tag-shaped
 * substrings and make no attempt to validate or balance markup. All patterns are
 * compiled once and shared, which is safe because {@link Pattern} is immutable.
 */
public class HtmlFilterTagUtils {

    /** Any run of whitespace (space, tab, CR, LF, VT, FF). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** A complete script element, including its body. */
    private static final Pattern SCRIPT_BLOCK =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** A complete style element, including its body. */
    private static final Pattern STYLE_BLOCK =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** Any single tag-shaped token. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /** Opening {@code <d class=...>} tag. */
    private static final Pattern D_CLASS_OPEN_TAG =
            Pattern.compile("<d class=[^>]*?>", Pattern.CASE_INSENSITIVE);

    /**
     * A line break (or the start of the input) followed by optional whitespace and another
     * line break. Group 1 is the leading separator, empty when the match is at the start.
     */
    private static final Pattern BLANK_LINES =
            Pattern.compile("(^|(\\r\\n|\\n))\\s*(\\r\\n|\\n)", Pattern.CASE_INSENSITIVE);

    /** Opening span tag carrying a {@code data-shimo-docs} attribute. */
    private static final Pattern SHIMO_SPAN = Pattern.compile("<span data-shimo-docs=\"(.*?)\">");

    private static final Pattern SRC_ATTRIBUTE = Pattern.compile("src=\"(.*?)\"", Pattern.DOTALL);

    private static final Pattern HASH_MARK = Pattern.compile("#(.*?)#", Pattern.DOTALL);

    private static final Pattern IMG_TAG =
            Pattern.compile("<(img|IMG).*?(\\>|\\/>|\\<\\/img>)", Pattern.DOTALL);

    private static final Pattern IMG_WITH_SRC =
            Pattern.compile("<img\\s+src=.*?(\\/>|\\<\\/img>)", Pattern.DOTALL);

    /**
     * Space-like tokens that are not matched by the regex class {@code \s}: the three HTML
     * space entities, the semicolon-less "&ensp" form the earlier code looked for, and the
     * characters those entities decode to (U+00A0, U+2002, U+2003). "&ensp;" must come before
     * "&ensp" so that the semicolon is consumed together with the entity.
     */
    private static final String[] SPACE_TOKENS = {
        "&nbsp;", "&ensp;", "&ensp", "&emsp;", "\u00A0", "\u2002", "\u2003"
    };

    /** Literal tag text and the text substituted for it so block boundaries become line breaks. */
    private static final String[][] LINE_BREAK_TAGS = {
        {"<br>", "<br>\n"},
        {"</table>", "</table>\n"},
        {"</h1>", "</h1>\n"},
        {"</li>", "</li>\r\n"},
        {"</p>", "</p>\n"},
        {"</div>", "\n"},
        {"</br>", "</br>\n"},
        {"</i>", "</i>\n"},
        {"</tr>", "</tr>\r\n"}
    };

    /**
     * Converts an HTML fragment to plain text.
     *
     * <p>Steps, in order: remove every whitespace character; drop script and style elements;
     * turn selected closing tags into line breaks; strip all remaining tags; remove curly
     * double quotes and space entities; collapse blank lines; finally emit a blank line after
     * each line that ended with {@code \n} and a single {@code \n} after each line that ended
     * with {@code \r\n} (list items and table rows).
     *
     * <p>NOTE(review): because all whitespace is removed up front, spaces between words are
     * lost ("Hello world" becomes "Helloworld"). That is kept as-is; confirm it is intended.
     *
     * @param htmlStr the HTML text; may be null
     * @return the extracted text, or the argument itself when it is null or empty
     */
    public static String filter(String htmlStr) {
        if (htmlStr == null || htmlStr.isEmpty()) {
            return htmlStr;
        }
        String text = WHITESPACE.matcher(htmlStr).replaceAll("");
        text = SCRIPT_BLOCK.matcher(text).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");

        // Paragraphs that hold nothing but two right curly quotes carry no content.
        text = text.replace("<p>\u201D\u201D</p>", "");
        for (String[] rule : LINE_BREAK_TAGS) {
            text = text.replace(rule[0], rule[1]);
        }
        text = ANY_TAG.matcher(text).replaceAll("");

        // Curly quotes and space-like tokens are dropped entirely.
        text = text.replace("\u201D", "").replace("\u201C", "");
        text = replaceSpaceTokens(text, "");
        // NOTE(review): this removes the literal two-character text backslash + 't', not a tab
        // (tabs are already gone); kept for compatibility, confirm whether it is wanted.
        text = text.replace("\\t", "");

        // Collapse each run of line breaks to its first separator; "$1" keeps that separator
        // so neighbouring lines are not glued together.
        text = BLANK_LINES.matcher(text).replaceAll("$1");
        return text.replace("\n", "\n\n").replace("\r\n\n", "\n");
    }

    /**
     * Removes every opening {@code <d class=...>} tag (case-insensitive).
     *
     * @param htmlStr the HTML text; may be null
     * @return the text without those opening tags, or null when the argument is null
     */
    public static String filterSpan(String htmlStr) {
        if (htmlStr == null) {
            return null;
        }
        return D_CLASS_OPEN_TAG.matcher(htmlStr).replaceAll("");
    }

    /**
     * Removes opening span tags that carry a {@code data-shimo-docs} attribute.
     *
     * <p>NOTE(review): despite the name, this does exactly what {@link #filterWeixinJs(String)}
     * does and never touches anchor tags; confirm whether anchor stripping was intended.
     *
     * @param htmlStr the HTML text; may be null
     * @return the filtered text, or null when the argument is null
     */
    public static String filterA(String htmlStr) {
        return filterWeixinJs(htmlStr);
    }

    /**
     * Removes {@code <d class=...>} opening tags and {@code </d>} closing tags, then normalises
     * quotes and space entities: a right curly quote becomes a space, a left curly quote
     * becomes a straight double quote, and each space entity or its decoded character becomes
     * one ordinary space.
     *
     * @param htmlStr the HTML text; may be null
     * @return the cleaned text, or null when the argument is null
     */
    public static String filterSpanContent(String htmlStr) {
        if (htmlStr == null) {
            return null;
        }
        String text = D_CLASS_OPEN_TAG.matcher(htmlStr).replaceAll("");
        text = text.replace("</d>", "");
        text = text.replace("\u201D", " ").replace("\u201C", "\"");
        return replaceSpaceTokens(text, " ");
    }

    /**
     * Removes every tag-shaped token ({@code <...>}) and keeps the text in between.
     *
     * @param htmlStr the HTML text; may be null
     * @return the text without tags, or null when the argument is null
     */
    public static String filterHtml(String htmlStr) {
        if (htmlStr == null) {
            return null;
        }
        return ANY_TAG.matcher(htmlStr).replaceAll("");
    }

    /**
     * Removes opening span tags that carry a {@code data-shimo-docs} attribute.
     *
     * @param htmlStr the HTML text; may be null
     * @return the filtered text, or null when the argument is null
     */
    public static String filterWeixinJs(String htmlStr) {
        if (htmlStr == null) {
            return null;
        }
        return SHIMO_SPAN.matcher(htmlStr).replaceAll("");
    }

    /**
     * Collects every {@code src="..."} attribute found in the input.
     *
     * <p>Each element is the whole attribute, including the {@code src=} prefix and the
     * quotes, not just the URL.
     *
     * @param html the HTML text; may be null
     * @return the matches in order of appearance; empty when there are none or html is null
     */
    public static List<String> getImgSRC(String html) {
        return findAll(SRC_ATTRIBUTE, html);
    }

    /**
     * Returns the text between the first pair of {@code #} characters.
     *
     * @param expression the text to search; may be null
     * @return the enclosed text without the {@code #} delimiters, or an empty string when
     *     there is no such pair or expression is null
     */
    public static String getStrFromMark(String expression) {
        if (expression == null) {
            return "";
        }
        Matcher matcher = HASH_MARK.matcher(expression);
        // The lazy group cannot contain '#', so it equals the full match minus its delimiters.
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Collects every {@code #...#} token found in the input.
     *
     * <p>Each element still includes its surrounding {@code #} characters.
     *
     * @param expression the text to search; may be null
     * @return the tokens in order of appearance; empty when there are none or expression is null
     */
    public static List<String> getStrFromMarkList(String expression) {
        return findAll(HASH_MARK, expression);
    }

    /**
     * Collects every image tag, from {@code <img} or {@code <IMG} up to the first {@code >}.
     *
     * @param html the HTML text; may be null
     * @return the tags in order of appearance; empty when there are none or html is null
     */
    public static List<String> getImg(String html) {
        return findAll(IMG_TAG, html);
    }

    /**
     * Collects image tags whose first attribute is {@code src} and that are closed with
     * {@code />} or {@code </img>}.
     *
     * <p>Any amount of whitespace may separate {@code <img} from {@code src}.
     *
     * @param html the HTML text; may be null
     * @return the tags in order of appearance; empty when there are none or html is null
     */
    public static List<String> getImgANDSRC(String html) {
        return findAll(IMG_WITH_SRC, html);
    }

    /**
     * Tells whether the regular expression matches anywhere in the content. The expression is
     * compiled with {@link Pattern#DOTALL}.
     *
     * @param content the text to search; null, empty or whitespace-only content yields false
     * @param regex the regular expression
     * @return true when a match is found, false otherwise
     */
    public static Boolean isPattern(String content, String regex) {
        if (isBlank(content)) {
            return false;
        }
        return Pattern.compile(regex, Pattern.DOTALL).matcher(content).find();
    }

    /**
     * Escapes a title: each single quote is prefixed with one backslash and each comma with
     * two backslashes.
     *
     * @param title the title text; may be null
     * @return the escaped title, or null when the argument is null
     */
    public static String getTitleNoMark(String title) {
        if (title == null) {
            return null;
        }
        return title.replace("'", "\\'").replace(",", "\\\\,");
    }

    /** Returns every full match of the pattern in the input, in order; empty for null input. */
    private static List<String> findAll(Pattern pattern, String input) {
        List<String> matches = new ArrayList<>();
        if (input == null) {
            return matches;
        }
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }

    /** Replaces every space token (entity or decoded character) with the given replacement. */
    private static String replaceSpaceTokens(String text, String replacement) {
        String result = text;
        for (String token : SPACE_TOKENS) {
            result = result.replace(token, replacement);
        }
        return result;
    }

    /** Returns true when the text is null, empty, or made only of whitespace characters. */
    private static boolean isBlank(String text) {
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}