Java爬虫01-httpclient和jsoup

Httpclient

GET请求

百度一下

百度搜索林俊杰,爬取搜索结果页下载为html

1
2
3
4
5
6
7
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
</dependencies>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
    /**
     * Crawls the Baidu search result page for "林俊杰" (URL-encoded in the query
     * string) and saves the response body to ./search.html.
     *
     * Fixes vs. original: the input stream was closed twice, and neither the
     * response nor the client was ever closed — all resources are now managed
     * with try-with-resources.
     */
    public static void main(String[] args) throws IOException {
        // https://www.baidu.com/s?wd=林俊杰 (wd parameter is URL-encoded)
        HttpGet httpGet = new HttpGet("https://www.baidu.com/s?wd=%E6%9E%97%E4%BF%8A%E6%9D%B0");
        // Set a browser-like User-Agent header so the server treats us as a normal browser.
        httpGet.addHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36");

        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            StatusLine statusLine = response.getStatusLine();
            System.out.println(statusLine.getStatusCode());

            HttpEntity entity = response.getEntity();
            File file = new File("./search.html");
            // Stream the body to disk in 1 KiB chunks; try-with-resources
            // guarantees both streams are closed exactly once.
            try (InputStream is = entity.getContent();
                 FileOutputStream fileOutputStream = new FileOutputStream(file)) {
                byte[] bytes = new byte[1024];
                int ch;
                while ((ch = is.read(bytes)) != -1) {
                    fileOutputStream.write(bytes, 0, ch);
                }
                fileOutputStream.flush();
            }

            // Alternative: read the whole body as a string first.
            // String responseBody = EntityUtils.toString(entity, "utf-8");
            // Files.write(Paths.get("./search.html"), responseBody.getBytes("utf-8"));
        }
    }

image-20240908223509742

POST请求

百度翻译

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/**
 * Sends a JSON POST request to the Baidu Translate endpoint to translate
 * {@code searchWord} from Chinese to English, then prints the raw response.
 *
 * Fixes vs. original: the client and response are now closed via
 * try-with-resources, and the response body is decoded explicitly as UTF-8
 * instead of the platform default charset.
 */
public static void main(String[] args) throws IOException {
    String searchWord = "爬虫";
    // Template request body captured from the browser; "query" and
    // "milliTimestamp" are overwritten below before sending.
    String defaultReqBody = "{\n" +
            " \"from\": \"zh\",\n" +
            " \"to\": \"en\",\n" +
            " \"reference\": \"\",\n" +
            " \"corpusIds\": [\n" +
            " \n" +
            " ],\n" +
            " \"qcSettings\": [\n" +
            " \"1\",\n" +
            " \"2\",\n" +
            " \"3\",\n" +
            " \"4\",\n" +
            " \"5\",\n" +
            " \"6\",\n" +
            " \"7\",\n" +
            " \"8\",\n" +
            " \"9\",\n" +
            " \"10\",\n" +
            " \"11\"\n" +
            " ],\n" +
            " \"needPhonetic\": false,\n" +
            " \"domain\": \"common\",\n" +
            " \"milliTimestamp\": 1725808602840\n" +
            "}";

    HttpPost httpPost = new HttpPost("https://fanyi.baidu.com/ait/text/translate");
    JSONObject params = JSONObject.parseObject(defaultReqBody);
    params.put("query", searchWord);
    // Use the current time so the request does not look replayed.
    params.put("milliTimestamp", System.currentTimeMillis());
    httpPost.setEntity(new StringEntity(params.toString(), "UTF-8"));
    httpPost.addHeader("Content-Type", "application/json");

    // Alternative: form-encoded POST to the suggestion endpoint.
    // HttpPost httpPost = new HttpPost("https://fanyi.baidu.com/sug");
    // List<BasicNameValuePair> formData = Arrays.asList(new BasicNameValuePair("kw", searchWord));
    // httpPost.setEntity(new UrlEncodedFormEntity(formData));

    System.out.println(params);
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpPost)) {
        HttpEntity entity = response.getEntity();
        // Decode explicitly as UTF-8; the no-charset overload falls back to
        // ISO-8859-1/platform defaults and garbles Chinese text.
        String result = EntityUtils.toString(entity, "UTF-8");
        System.out.println(result);
    }
}

image-20240909233515742

Jsoup爬取数据

1
2
3
4
5
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Fetches the JD.com home page with jsoup and prints the page title plus the
 * text of the list items inside the "#navitems-group1" element.
 *
 * Fix vs. original: {@code getElementById} returns null when the element is
 * absent (JD renders parts of the page via JS, so the raw HTML may lack it),
 * which previously caused a NullPointerException — now guarded explicitly.
 */
public static void main(String[] args) throws IOException {
    URL url = new URL("https://www.jd.com/?cu=true");
    // 5000 ms connect/read timeout.
    Document document = Jsoup.parse(url, 5000);
    Elements elements = document.getElementsByTag("title");
    System.out.println(elements.get(0).text());

    // Element-API lookup; may be null if the nav is injected client-side.
    Element navGroup = document.getElementById("navitems-group1");
    if (navGroup != null) {
        Elements liElements = navGroup.getElementsByTag("li");
        System.out.println(liElements.stream().map(e -> e.text()).collect(Collectors.toList()));
    } else {
        System.out.println("#navitems-group1 not found (page likely rendered via JS)");
    }

    // CSS-selector lookup for the same container; select() never returns null,
    // just an empty Elements, so no guard is needed here.
    Elements select = document.select("#navitems-group1");
    System.out.println(select.stream().map(e -> e.text()).collect(Collectors.toList()));
}
京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!
[企业会员, 大牌奥莱, 京东五金城, 进口好物]
[企业会员 大牌奥莱 京东五金城 进口好物]

image-20240910231627632

发现加载结果与实际访问结果不一致,说明京东做了一定的反爬或者前端使用Ajax异步加载,这些往往比较难以识别爬取,需要依赖框架或者浏览器驱动来模拟爬取

结论

​ 目前主流爬虫框架底层也是采用Httpclient和Jsoup来发起请求和解析HTML