本文實例講述了 Java 過濾標籤、將 HTML 內容轉換為純文本的實現方法。分享給大家供大家參考,具體如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
/** * 把html內容轉為文本 * @param html 需要處理的html文本 * @param filterTags 需要保留的html標簽樣式 * @return */ public static String trimHtml2Txt(String html, String[] filterTags){ html = html.replaceAll( "\\<head>[\\s\\S]*?</head>(?i)" , "" ); //去掉head html = html.replaceAll( "\\<!--[\\s\\S]*?-->" , "" ); //去掉注釋 html = html.replaceAll( "\\<![\\s\\S]*?>" , "" ); html = html.replaceAll( "\\<style[^>]*>[\\s\\S]*?</style>(?i)" , "" ); //去掉樣式 html = html.replaceAll( "\\<script[^>]*>[\\s\\S]*?</script>(?i)" , "" ); //去掉js html = html.replaceAll( "\\<w:[^>]+>[\\s\\S]*?</w:[^>]+>(?i)" , "" ); //去掉word標簽 html = html.replaceAll( "\\<xml>[\\s\\S]*?</xml>(?i)" , "" ); html = html.replaceAll( "\\<html[^>]*>|<body[^>]*>|</html>|</body>(?i)" , "" ); html = html.replaceAll( "\\\r\n|\n|\r" , " " ); //去掉換行 html = html.replaceAll( "\\<br[^>]*>(?i)" , "\n\r" ); List<String> tags = new ArrayList<String>(); List<String> s_tags = new ArrayList<String>(); List<String> halfTag = Arrays.asList( new String[]{ "img" , "table" , "thead" , "th" , "tr" , "td" }); // if (filterTags != null && filterTags.length > 0 ){ for (String tag : filterTags) { tags.add( "<" +tag+(halfTag.contains(tag)? "" : ">" )); //開始標簽 if (! "img" .equals(tag)) tags.add( "</" +tag+ ">" ); //結束標簽 s_tags.add( "#REPLACETAG" +tag+(halfTag.contains(tag)? "" : "REPLACETAG#" )); //盡量替換為復雜一點的標記,以免與顯示文本混合,如:文本中包含#td、#table等 if (! "img" .equals(tag)) s_tags.add( "#REPLACETAG/" +tag+ "REPLACETAG#" ); } } html = StringUtils.replaceEach(html, tags.toArray( new String[tags.size()]), s_tags.toArray( new String[s_tags.size()])); html = html.replaceAll( "\\</p>(?i)" , "\n\r" ); html = html.replaceAll( "\\<[^>]+>" , "" ); html = StringUtils.replaceEach(html,s_tags.toArray( new String[s_tags.size()]),tags.toArray( new String[tags.size()])); html = html.replaceAll( "\\ " , " " ); return html.trim(); } |
希望本文所述對大家的 Java 程序設計有所幫助。