SpiderBasedHttpRequest.java
package com.yoho.search.spider.common;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
/**
* Created by ginozhang on 2017/2/28.
*/
@Component
public class SpiderBasedHttpRequest {
    private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER");

    /**
     * Maximum number of retry attempts.
     */
    private static final int MAX_RETRY_TIMES = 2;
    public BaikeBO get(String url) {
        int times = 1;
        BaikeBO baikeBO = null;
        while (times <= MAX_RETRY_TIMES) {
            try {
                if ((baikeBO = doGet(url)) != null) {
                    return baikeBO;
                }
            } catch (Exception e) {
                LOGGER.error(url, e);
            }
            times++;
        }
        return baikeBO;
    }
    private BaikeBO doGet(String url) throws Exception {
        String response = sendGet(url, "");
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode tagNode = htmlCleaner.clean(response);
        TagNode[] titles = tagNode.getElementsByName("title", true);
        String title = "";
        // If the page title is Baidu Baike's generic home-page title ("Baidu Baike — the world's
        // largest Chinese encyclopedia"), the entry was not found; treat that as no result.
        if (titles == null || titles.length == 0 || (title = titles[0].getText().toString()).equals("百度百科——全球最大中文百科全书")) {
            return null;
        }
        String summary = "";
        Object[] summaryNodes = tagNode.evaluateXPath("//div[@class='lemma-summary']");
        if (summaryNodes != null && summaryNodes.length != 0) {
            for (Object obja : summaryNodes) {
                TagNode tna = (TagNode) obja;
                summary = summary + " " + tna.getText().toString();
            }
        }
        String content = "";
        Object[] contentNodes = tagNode.evaluateXPath("//div[@class='main-content']");
        if (contentNodes != null && contentNodes.length != 0) {
            for (Object obja : contentNodes) {
                TagNode tna = (TagNode) obja;
                content = content + " " + tna.getText().toString();
            }
        }
        // Strip the "_百度百科" ("_Baidu Baike") suffix that every page title carries.
        title = title.replaceAll("_百度百科", "");
        return new BaikeBO(url, title, summary, content);
    }
    /**
     * Sends a GET request to the specified URL.
     *
     * @param url   the URL to send the request to
     * @param param request parameters, in the form name1=value1&name2=value2
     * @return the response body of the remote resource identified by the URL
     */
    private String sendGet(String url, String param) throws Exception {
        String result = "";
        BufferedReader in = null;
        try {
            String urlNameString = url + "?" + param;
            URL realUrl = new URL(urlNameString);
            // Open a connection to the URL
            URLConnection connection = realUrl.openConnection();
            // Set common request headers
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0");
            // Establish the actual connection
            connection.connect();
            // Read the response body line by line through a BufferedReader
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                result += line;
            }
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result;
    }
    public static void main(String[] args) throws XPatherException {
        SpiderBasedHttpRequest service = new SpiderBasedHttpRequest();
        String url = "http://baike.baidu.com/item/vans";
        BaikeBO baikeBO = service.get(url);
        System.out.println(baikeBO);
        url = "http://baike.baidu.com/item/vans%20ad";
        baikeBO = service.get(url);
        System.out.println(baikeBO);
    }
}
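
The class above depends on a BaikeBO bean from the same package that is not shown on this page. Below is a minimal sketch of what such a holder could look like, assuming it only needs to carry the four constructor arguments used in doGet; the getters and the toString format are illustrative assumptions, not the project's actual implementation.

package com.yoho.search.spider.common;

/**
 * Hypothetical value object for a crawled Baidu Baike entry.
 * The field set is inferred from the constructor call in SpiderBasedHttpRequest#doGet.
 */
public class BaikeBO {

    private final String url;
    private final String title;
    private final String summary;
    private final String content;

    public BaikeBO(String url, String title, String summary, String content) {
        this.url = url;
        this.title = title;
        this.summary = summary;
        this.content = content;
    }

    public String getUrl() { return url; }
    public String getTitle() { return title; }
    public String getSummary() { return summary; }
    public String getContent() { return content; }

    @Override
    public String toString() {
        // Truncate long content so System.out.println(baikeBO) in main stays readable.
        int max = Math.min(content.length(), 100);
        return "BaikeBO{url='" + url + "', title='" + title + "', summary='" + summary
                + "', content='" + content.substring(0, max) + "...'}";
    }
}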