0
点赞
收藏
分享

微信扫一扫

爬虫 笔记【自用】


HttpClient

Get

Get(无参)

/**
 * Sends a GET request without query parameters and prints the response body.
 *
 * <p>The body is only read and printed when the server answers with HTTP 200.
 * I/O failures are reported on stderr via {@code printStackTrace()}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Step 2 of the recipe: the request object carrying the target URL.
    org.apache.http.client.methods.HttpGet httpGet = new org.apache.http.client.methods.HttpGet("https://onlineweb.zhihuishu.com/onlinestuh5");
    // Steps 1 and 3: create the client and execute the request.
    // try-with-resources closes the response first and then the client, even
    // when execute() throws. The previous finally block called
    // response.close() on a null reference in that case and could skip
    // httpClient.close() altogether.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            // Decode the entity as UTF-8 text.
            String body = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(body);
        }
    } catch (IOException ioException) {
        ioException.printStackTrace();
    }
}

Get(有参)

/**
 * Sends a GET request with one query parameter and prints the length of the
 * response body.
 *
 * <p>The body is only read when the server answers with HTTP 200.
 *
 * @param args unused
 * @throws Exception if the base URI cannot be parsed or built
 */
public static void main(String[] args) throws Exception {
    // Build the target URI and append the query parameter.
    URIBuilder uriBuilder = new URIBuilder("https://t.bilibili.com/");
    uriBuilder.setParameter("spm_id_from", "333.1007.0.0");

    HttpGet httpGet = new HttpGet(uriBuilder.build());
    System.out.println("访问的是:"+httpGet);

    // The original never closed the response or the client, leaking the
    // underlying connection. try-with-resources releases both on every path.
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String body = EntityUtils.toString(response.getEntity(), "utf8");
            System.out.println(body.length());
        }
    } catch (IOException ioException) {
        ioException.printStackTrace();
    }
}

Post

Post无参

/**
 * Sends a POST request without a body and prints the response text.
 *
 * <p>The body is only read and printed when the server answers with HTTP 200.
 * I/O failures are reported on stderr via {@code printStackTrace()}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // The request object: a POST to the target URL, no entity attached.
    HttpPost httpPost = new HttpPost("https://www.bilibili.com/");
    // try-with-resources closes the response and then the client on every
    // path. The previous finally block dereferenced a null response whenever
    // execute() failed, and skipped httpClient.close() if response.close()
    // threw.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String body = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(body);
        }
    } catch (IOException ioException) {
        ioException.printStackTrace();
    }
}

Post有参 

/**
 * Sends a POST request carrying one URL-encoded form field and prints the
 * length of the response body.
 *
 * <p>The body is only read when the server answers with HTTP 200.
 *
 * @param args unused
 * @throws Exception if the form entity cannot be created with the requested charset
 */
public static void main(String[] args) throws Exception {
    // Form fields to submit.
    List<NameValuePair> formFields = new ArrayList<>();
    formFields.add(new BasicNameValuePair("spm_id_from", "333.1007.0.0"));

    // Encode the form explicitly as UTF-8 so it matches the charset used to
    // decode the response below; without it the library default is used.
    UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(formFields, "utf-8");

    HttpPost httpPost = new HttpPost("https://www.bilibili.com/");
    httpPost.setEntity(formEntity);

    // try-with-resources closes the response and then the client on every
    // path. The previous finally block dereferenced a null response whenever
    // execute() failed.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String body = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(body.length());
        }
    } catch (IOException ioException) {
        ioException.printStackTrace();
    }
}

HttpClient连接池

/**
 * Configures a pooling connection manager and issues one GET request with it.
 *
 * @param args unused
 */
public static void main(String[] args) {
    PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();

    // Upper bound on connections across the whole pool.
    connectionManager.setMaxTotal(100);
    // Default upper bound on connections per route (i.e. per target host).
    connectionManager.setDefaultMaxPerRoute(10);

    doGet(connectionManager);
}
/**
 * Issues a GET request through a client backed by the given connection pool
 * and prints the length of the response body.
 *
 * <p>The body is only read when the server answers with HTTP 200.
 *
 * @param cm the connection manager the client takes its connections from
 */
private static void doGet(PoolingHttpClientConnectionManager cm)
{
    // Build the client on top of the supplied pool. The original called
    // HttpClients.createDefault(), which ignored cm, so the configured limits
    // were never applied.
    CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build();
    HttpGet httpGet = new HttpGet("http://www.bilibili.com");

    // Only the response is closed here, which frees the connection leased
    // from the pool. The client is deliberately left open: closing it would
    // also shut down the connection manager owned by the caller.
    try (CloseableHttpResponse response = client.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String body = EntityUtils.toString(response.getEntity(), "utf8");
            System.out.println(body.length());
        }
    } catch (IOException ioException) {
        ioException.printStackTrace();
    }
}

JSoup

四个资源:

<!-- jsoup: HTML parsing library -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<!-- JUnit 4: provides the @Test annotation used by the test snippets -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.1</version>
</dependency>
<!-- Apache Commons IO -->
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<!-- Apache Commons Lang 3 -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>

配置log4j.properties(最好在resources目录下)

# Root logger: level debug, writing to the console appender (stdout) and the
# rolling file appender (R).
log4j.rootLogger=debug, stdout, R

# Console appender.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout

# Pattern to output the caller's file name and line number.
log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n

# Rolling file appender writing to example.log.
log4j.appender.R=org.apache.log4j.RollingFileAppender
log4j.appender.R.File=example.log

# Roll the file over once it reaches 100KB.
log4j.appender.R.MaxFileSize=100KB
# Keep up to five backup files
log4j.appender.R.MaxBackupIndex=5

# File output pattern: level, thread, logger name, message.
log4j.appender.R.layout=org.apache.log4j.PatternLayout
log4j.appender.R.layout.ConversionPattern=%p %t %c - %m%n

测试URL

/**
 * Fetches a page with jsoup (10 second timeout) and prints its title.
 *
 * @throws Exception if the URL is malformed or the page cannot be fetched
 */
@Test
public void testURl() throws Exception {
    Document document = Jsoup.parse(new URL("http://www.baidu.com"), 10000);
    // Document.title() returns an empty string when the page has no <title>,
    // whereas getElementsByTag("title").first() returns null in that case and
    // the chained text() call throws a NullPointerException.
    String title = document.title();
    System.out.println(title);
}

选择器操作(部分),语法和 CSS 选择器很像

爬虫 笔记【自用】_搜索引擎

 

可能没时间搞了,先这样吧~

举报

相关推荐

算法笔记(自用)

JS笔记(自用)

链表笔记(自用)

java自用的笔记(1)

初识c指针笔记自用

JDBC笔记(自用未完结)

0 条评论