0
点赞
收藏
分享

微信扫一扫

使用Java调用百度搜索(转)


search-demo托管于github

 

search-demo演示了如何利用Java来调用百度搜索和谷歌搜索,更多细节请到github上查看search-demo

 

自己没有搜索引擎,又想要大规模的数据源,怎么办?可以对百度搜索和谷歌搜索善加利用,以小搏大,站在巨人的肩膀上。有很多的应用场景可以很巧妙地借助百度搜索和谷歌搜索来实现,比如网站的新闻采集,比如技术、品牌的新闻跟踪,比如知识库的收集,比如人机问答系统等。我之前做的一个准确率达百分之九十几的人机问答系统,其数据源中的一部分就是充分利用了百度搜索和谷歌搜索。在此演示的技术的基础上,可以容易地扩展到其他的搜索引擎;借鉴本文使用的NekoHTML+XPath或JSoup+CSSPath技术,可以轻松获取页面中自定义的内容。

 

实现方式一:NekoHTML+XPath

 

Java代码 
        
    
1. package
2.   
3. import
4. import
5. import
6. import
7. import
8. import
9. import
10. import
11. import
12. import
13. import
14.   
15. import
16. import
17. import
18. import
19. import
20. import
21. import
22.   
23. import
24. import
25. import
26. import
27.   
28. public class NekoHTMLBaiduSearcher implements
29. private static final Logger LOG = LoggerFactory.getLogger(NekoHTMLBaiduSearcher.class);  
30.   
31. public
32. null;  
33. try
34. new
35. return
36. catch
37. "错误", e);  
38. finally
39. if (in != null) {  
40. try
41.                     in.close();  
42. catch
43. "错误", e);  
44.                 }  
45.             }  
46.         }  
47. return null;  
48.     }  
49.   
50. public
51. return parse(in, xpathExpression, "UTF-8");  
52.     }  
53.   
54. public
55. return parseMore(in, xpathExpression, "UTF-8");  
56.     }  
57.   
58. public
59. new
60. new
61. try
62. // 设置网页的默认编码
63.             parser.setProperty(  
64. "http://cyberneko.org/html/properties/default-encoding",  
65.                     encoding);  
66. /*
67.              * The Xerces HTML DOM implementation does not support namespaces
68.              * and cannot represent XHTML documents with namespace information.
69.              * Therefore, in order to use the default HTML DOM implementation
70.              * with NekoHTML's DOMParser to parse XHTML documents, you must turn
71.              * off namespace processing.
72.              */
73. "http://xml.org/sax/features/namespaces", false);  
74. new InputSource(new BufferedReader(new
75.             Document doc = parser.getDocument();  
76.             NodeList products = XPathAPI.selectNodeList(doc, xpathExpression.toUpperCase());  
77. for (int i = 0; i < products.getLength(); i++) {  
78.                 Node node = products.item(i);  
79.                 String title = node.getTextContent();  
80. new
81. "title", title);  
82. try
83. "href").getTextContent();  
84. "href", href);  
85. catch
86. "提取链接失败",e);  
87.                 }  
88.                 list.add(map);  
89.             }  
90. catch
91. "错误", e);  
92.         }  
93. return
94.     }  
95.   
96. public
97. new
98. new
99. try
100. // 设置网页的默认编码
101.             parser.setProperty(  
102. "http://cyberneko.org/html/properties/default-encoding",  
103.                     encoding);  
104. /*
105.              * The Xerces HTML DOM implementation does not support namespaces
106.              * and cannot represent XHTML documents with namespace information.
107.              * Therefore, in order to use the default HTML DOM implementation
108.              * with NekoHTML's DOMParser to parse XHTML documents, you must turn
109.              * off namespace processing.
110.              */
111. "http://xml.org/sax/features/namespaces", false);  
112. new InputSource(new BufferedReader(new
113.             Document doc = parser.getDocument();  
114.             NodeList products = XPathAPI.selectNodeList(doc, xpathExpression.toUpperCase());  
115. for (int i = 0; i < products.getLength(); i++) {  
116.                 Node node = products.item(i);  
117.                 list.add(node.getTextContent());  
118.             }  
119. catch
120. "错误", e);  
121.         }  
122. return
123.     }  
124.   
125. @Override
126. public
127. null;  
128. try
129. new
130. return
131. catch
132. "错误", e);  
133. finally
134. if (in != null) {  
135. try
136.                     in.close();  
137. catch
138. "错误", e);  
139.                 }  
140.             }  
141.         }  
142. return null;  
143.     }  
144.   
145. public
146. //保证只读一次
147. byte[] datas = Tools.readAll(in);  
148. if
149. try
150. "内容:" + new String(datas, "UTF-8"));  
151. catch
152. "错误", e);  
153.             }  
154.         }  
155.   
156. new
157.   
158. "//html/body/div/div/div/div[3]/p/span";  
159.         List<String> totals = parse(in, totalXpathExpression);  
160. int
161. int len = 10;  
162. if (totals != null && totals.size() == 1) {  
163. 0);  
164. int start = 10;  
165. if (str.indexOf("约") != -1) {  
166. 11;  
167.             }  
168. ",", "").replace("个", ""));  
169. "搜索结果数:"
170. else
171. return null;  
172.         }  
173. if (total < 1) {  
174. return null;  
175.         }  
176. if (total < 10) {  
177.             len = total;  
178.         }  
179. new
180. for (int i = 0; i < len; i++) {  
181. "";  
182. "";  
183. "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/h3/a";  
184. "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/div[1]";  
185. "titleXpathExpression:"
186. "summaryXpathExpression:"
187. //重新构造输入流
188. new
189.             List<String> titles = parse(in, titleXpathExpression);  
190.   
191. //重新构造输入流
192. new
193.             List<Map<String, String>> titleWithHrefs = parseMore(in, titleXpathExpression);  
194. for
195. "title");  
196. "href");  
197. " " + titleWithHref.get("href"));  
198. if (href != null) {  
199.                     content = Tools.getHTMLContent(href);  
200.                     url = href;  
201. else
202. "页面正确提取失败");  
203.                 }  
204.             }  
205.   
206. //重新构造输入流
207. new
208.             List<String> summaries = parse(in, summaryXpathExpression);  
209. //处理百度知道1
210. if (titles != null && titles.size() == 1 && (summaries == null
211. //重新构造输入流
212. new
213. "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[2]/div/div/p[2]";  
214. "baiduZhidao1XpathExpression:"
215.                 summaries = parse(in, baiduZhidao1XpathExpression);  
216.             }  
217. //处理百度知道2
218. if (titles != null && titles.size() == 1 && (summaries == null
219. //重新构造输入流
220. new
221. "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[2]";  
222. "baiduZhidao2XpathExpression:"
223.                 summaries = parse(in, baiduZhidao2XpathExpression);  
224.             }  
225. //处理百度文库
226. if (titles != null && titles.size() == 1 && (summaries == null
227. //重新构造输入流
228. new
229. "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[1]";  
230. "baiduWenkuXpathExpression:"
231.                 summaries = parse(in, baiduWenkuXpathExpression);  
232.             }  
233.   
234. if (titles != null && titles.size() == 1 && summaries != null && summaries.size() == 1) {  
235. new
236. 0));  
237.                 webpage.setUrl(url);  
238. 0));  
239.                 webpage.setContent(content);  
240.                 webpages.add(webpage);  
241. else
242. "获取搜索结果列表项出错:" + titles + " - "
243.             }  
244.         }  
245. if(webpages.size() < 10){              
246. //处理百度百科
247. "//html/body/div/div/div/div[3]/div[2]/div/h3/a";  
248. "//html/body/div/div/div/div[3]/div[2]/div/div/p";  
249. "处理百度百科 titleXpathExpression:"
250. "处理百度百科 summaryXpathExpression:"
251. //重新构造输入流
252. new
253.             List<String> titles = parse(in, titleXpathExpression);  
254. //重新构造输入流
255. new
256.             List<Map<String, String>> titleWithHrefs = parseMore(in, titleXpathExpression);  
257. "";  
258. "";  
259. for
260. "title");  
261. "href");  
262. " " + titleWithHref.get("href"));  
263. if (href != null) {  
264.                     content = Tools.getHTMLContent(href);  
265.                     url = href;  
266. else
267. "页面正确提取失败");  
268.                 }  
269.             }  
270. //重新构造输入流
271. new
272.             List<String> summaries = parse(in, summaryXpathExpression);  
273. if (titles != null && titles.size() == 1 && summaries != null && summaries.size() == 1) {  
274. new
275. 0));  
276.                 webpage.setUrl(url);  
277. 0));  
278.                 webpage.setContent(content);  
279.                 webpages.add(webpage);  
280. else
281. "获取搜索结果列表项出错:" + titles + " - "
282.             }  
283.         }  
284. if
285. return null;  
286.         }  
287. return
288.     }  
289.   
290. public static void
291. "http://www.baidu.com/s?pn=0&wd=杨尚川";  
292.           
293. new
294.         List<Webpage> webpages = searcher.search(url);  
295. if (webpages != null) {  
296. int i = 1;  
297. for
298. "搜索结果 " + (i++) + " :");  
299. "标题:"
300. "URL:"
301. "摘要:"
302. "正文:"
303. "");  
304.             }  
305. else
306. "没有搜索到结果");  
307.         }  
308.     }  
309. }

 

 

实现方式二:JSoup+CSSPath

 


Java代码 
        
    
package org.apdplat.demo.search;

// NOTE(review): this listing was garbled by web extraction — import targets and
// many statement bodies were truncated. The code below is a reconstruction from
// the visible fragments; verify against the search-demo repository on GitHub.
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scrapes Baidu web-search result pages using JSoup CSS selectors.
 *
 * <p>Special layouts (百度百科 and 百度知道 result items) are handled by
 * falling back to alternate selectors when the standard ones match nothing.</p>
 *
 * <p>NOTE(review): {@code Searcher}, {@code Webpage} and {@code Tools} are
 * project types not visible in this listing — their contracts are assumed
 * from call sites.</p>
 */
public class JSoupBaiduSearcher implements Searcher {
    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    /**
     * Fetches a Baidu results URL and parses up to 10 result items.
     *
     * @param url a Baidu search URL, e.g. {@code http://www.baidu.com/s?pn=0&wd=...}
     * @return parsed webpages; null when the result count is below 1;
     *         possibly empty when parsing fails
     */
    @Override
    public List<Webpage> search(String url) {
        List<Webpage> webpages = new ArrayList<Webpage>();
        try {
            Document document = Jsoup.connect(url).get();

            // Total-hits element: "百度为您找到相关结果约1,000个"
            String cssQuery = "html body div#out div#in div#wrapper div#container.container_s p#page span.nums";
            LOG.debug("total cssQuery: " + cssQuery);
            Element totalElement = document.select(cssQuery).first();
            String totalText = totalElement.text();
            LOG.info("搜索结果:" + totalText);
            // Skip the fixed prefix; one char longer when 约 (approx.) appears.
            int start = 10;
            if (totalText.indexOf("约") != -1) {
                start = 11;
            }
            int total = Integer.parseInt(totalText.substring(start).replace(",", "").replace("个", ""));
            LOG.info("搜索结果数:" + total);
            int len = 10;
            if (total < 1) {
                return null;
            }
            if (total < 10) {
                len = total;
            }
            for (int i = 0; i < len; i++) {
                String titleCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left table#" + (i + 1) + ".result tbody tr td.c-default h3.t a";
                String summaryCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left table#" + (i + 1) + ".result tbody tr td.c-default div.c-abstract";
                LOG.debug("titleCssQuery:" + titleCssQuery);
                LOG.debug("summaryCssQuery:" + summaryCssQuery);
                Element titleElement = document.select(titleCssQuery).first();
                String titleText = "";
                String href = "";
                if (titleElement != null) {
                    titleText = titleElement.text();
                    href = titleElement.attr("href");
                } else {
                    // 处理百度百科 — Baike items use a div-based layout.
                    titleCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left div#1.result-op h3.t a";
                    summaryCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left div#1.result-op div p";
                    LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery);
                    LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery);
                    titleElement = document.select(titleCssQuery).first();
                    if (titleElement != null) {
                        titleText = titleElement.text();
                        href = titleElement.attr("href");
                    }
                }
                LOG.debug(titleText);
                Element summaryElement = document.select(summaryCssQuery).first();
                // 处理百度知道 — Zhidao items put the abstract in a font element.
                if (summaryElement == null) {
                    summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font");
                    LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery);
                    summaryElement = document.select(summaryCssQuery).first();
                }
                String summaryText = "";
                if (summaryElement != null) {
                    summaryText = summaryElement.text();
                }
                LOG.debug(summaryText);

                if (titleText != null && !"".equals(titleText.trim()) && summaryText != null && !"".equals(summaryText.trim())) {
                    Webpage webpage = new Webpage();
                    webpage.setTitle(titleText);
                    webpage.setUrl(href);
                    webpage.setSummary(summaryText);
                    if (href != null) {
                        String content = Tools.getHTMLContent(href);
                        webpage.setContent(content);
                    } else {
                        LOG.info("页面正确提取失败");
                    }
                    webpages.add(webpage);
                } else {
                    LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText);
                }
            }

        } catch (Exception ex) {
            LOG.error("搜索出错", ex);
        }
        return webpages;
    }

    /** Demo entry point: searches Baidu for 杨尚川 and logs the results. */
    public static void main(String[] args) {
        String url = "http://www.baidu.com/s?pn=0&wd=杨尚川";

        Searcher searcher = new JSoupBaiduSearcher();
        List<Webpage> webpages = searcher.search(url);
        if (webpages != null) {
            int i = 1;
            for (Webpage webpage : webpages) {
                LOG.info("搜索结果 " + (i++) + " :");
                LOG.info("标题:" + webpage.getTitle());
                LOG.info("URL:" + webpage.getUrl());
                LOG.info("摘要:" + webpage.getSummary());
                LOG.info("正文:" + webpage.getContent());
                LOG.info("");
            }
        } else {
            LOG.error("没有搜索到结果");
        }
    }
}

 

 



  • search-demo.rar (9.5 KB)
  • 描述: 2013-10-18
  • 下载次数: 40
  • search-demo-master-2013-10-23.zip (13.4 KB)
  • 描述: 2013-10-23
  • 下载次数: 5
举报

相关推荐

0 条评论