在使用 Java 进行网络爬虫开发时，Apache HttpClient 是一个经典且强大的选择。相比 JDK 原生的 URLConnection/HttpURLConnection，它提供了更完善的连接管理和协议支持。下面是一个简单的实现示例，展示了如何通过封装接口来统一处理 GET 和 POST 请求。
核心实现代码如下,该类实现了自定义的 ICrawl 接口,负责具体的抓取逻辑:
package com.simple.crawImpl;
import com.simple.Icrawl.ICrawl;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
public class HttpClientCrawlerImpl implements ICrawl {
public CloseableHttpClient httpClient = HttpClients.custom().build();
@Override
public CrawlResultPojo crawl(UrlPojo urlpojo) {
if (urlpojo == null) return ;
();
;
;
{
(urlpojo.getUrl());
response = httpClient.execute(httpGet);
response.getEntity();
(entity.getContent(), );
br = (isr);
;
();
((line = br.readLine()) != ) {
context.append(line + );
}
crawlResultPojo.setSuccess();
crawlResultPojo.setPageContent(context.toString());
crawlResultPojo;
} (IOException e) {
e.printStackTrace();
crawlResultPojo.setSuccess();
} {
{
(br != ) br.close();
(response != ) response.close();
} (IOException e) {
e.printStackTrace();
}
}
crawlResultPojo;
}
CrawlResultPojo {
(urlPojo == || urlPojo.getUrl() == ) ;
();
;
{
RequestBuilder.post().setUri( (urlPojo.getUrl()));
Map<String, Object> parasMap = urlPojo.getParasMap();
(parasMap != ) {
(Entry<String, Object> entry : parasMap.entrySet()) {
rb.addParameter(entry.getKey(), entry.getValue().toString());
}
}
rb.build();
httpClient.execute(httpUriRequest).getEntity();
(entity.getContent(), );
br = (isr);
;
();
((line = br.readLine()) != ) {
stringBuilder.append(line + );
}
crawlResultPojo.setPageContent(stringBuilder.toString());
crawlResultPojo.setSuccess();
crawlResultPojo;
} (URISyntaxException e) {
e.printStackTrace();
} (ClientProtocolException e) {
e.printStackTrace();
} (IOException e) {
e.printStackTrace();
} {
{
(br != ) br.close();
} (IOException e) {
e.printStackTrace();
}
}
crawlResultPojo.setSuccess();
crawlResultPojo;
}
{
();
;
(url);
Map<String, Object> parasMap = <>();
;
parasMap.put(, );
parasMap.put(, );
parasMap.put(, );
urlPojo.setParasMap(parasMap);
httpClientCrawlerImpl.crawl4Post(urlPojo);
print(resultPojo);
resultPojo = httpClientCrawlerImpl.crawl(urlPojo);
print(resultPojo);
}
{
System.out.println(s);
}
}

