java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错 关键是在忽略https的地方加上:connection.setRequestProperty("User-Agent", "Mozilla/4.76"); 注意:需要加在new BufferedReader 前面才行,否则无效。 完整的HttpsUrlValidator.java代码
java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错
关键是在忽略https的地方加上:connection.setRequestProperty("User-Agent", "Mozilla/4.76");
注意:需要加在new BufferedReader 前面才行,否则无效。
HttpsURLConnection.setDefaultHostnameVerifier(hv);
connection = (HttpURLConnection) validationUrl.openConnection();
//first set User-Agent to solve Server returned HTTP response code: 403 for URL
connection.setRequestProperty("User-Agent", "Mozilla/4.76");
final BufferedReader in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
抓取的地方先调用忽略https的代码
//先调用下忽略https证书的再请求才可以
HttpsUrlValidator.retrieveResponseFromServer(url);
doc = Jsoup
.connect(url)
.header("User-Agent",rand_agents)
完整的HttpsUrlValidator.java代码如下:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;
public class HttpsUrlValidator {
static HostnameVerifier hv = new HostnameVerifier() {
public boolean verify(String urlHostName, SSLSession session) {
System.out.println("Warning: URL Host: " + urlHostName + " vs. "
+ session.getPeerHost());
return true;
}
};
public final static String retrieveResponseFromServer(final String url) {
HttpURLConnection connection = null;
try {
URL validationUrl = new URL(url);
trustAllHttpsCertificates();
HttpsURLConnection.setDefaultHostnameVerifier(hv);
connection = (HttpURLConnection) validationUrl.openConnection();
//first set User-Agent to solve Server returned HTTP response code: 403 for URL
connection.setRequestProperty("User-Agent", "Mozilla/4.76");
final BufferedReader in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
String line;
final StringBuffer stringBuffer = new StringBuffer(255);
synchronized (stringBuffer) {
while ((line = in.readLine()) != null) {
stringBuffer.append(line);
stringBuffer.append("\n");
}
return stringBuffer.toString();
}
} catch (final IOException e) {
System.out.println(e.getMessage());
return null;
} catch (final Exception e1){
System.out.println(e1.getMessage());
return null;
}finally {
if (connection != null) {
connection.disconnect();
}
}
}
public static void trustAllHttpsCertificates() throws Exception {
javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
javax.net.ssl.TrustManager tm = new miTM();
trustAllCerts[0] = tm;
javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext
.getInstance("SSL");
sc.init(null, trustAllCerts, null);
javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc
.getSocketFactory());
}
static class miTM implements javax.net.ssl.TrustManager,
javax.net.ssl.X509TrustManager {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
public boolean isServerTrusted(
java.security.cert.X509Certificate[] certs) {
return true;
}
public boolean isClientTrusted(
java.security.cert.X509Certificate[] certs) {
return true;
}
public void checkServerTrusted(
java.security.cert.X509Certificate[] certs, String authType)
throws java.security.cert.CertificateException {
return;
}
public void checkClientTrusted(
java.security.cert.X509Certificate[] certs, String authType)
throws java.security.cert.CertificateException {
return;
}
}
}
大自然,飘然的风,QQ群: python技术交流群:453879716,人工智能深度学习群:251088643
golang技术交流群:316397059,vuejs技术交流群:458915921 ,有兴趣的可以加入