转自:https://www.cnblogs.com/codingexperience/p/5319850.html
一、HttpClient简介
HttpClient是一个客户端的HTTP通信实现库,它不是一个浏览器。关于HTTP协议,可以搜索相关的资料。它的设计目的是发送与接收HTTP报文。它不会执行嵌入在页面中的JavaScript代码,所以当需要抓取通过AJAX技术获取实际内容的页面时,需要使用WebClient等其他开源库。HttpClient最新版已经到第5版,但已经稳定的应该是4.5.2版本,官方网址:https://hc.apache.org/ 。
二、HttpClient简单使用
HttpClient的主要用途是接收HTTP响应的内容,下面介绍HttpClient的简单使用,抓取博客园的首页。至于HttpClient4.5的常用API可以参考这篇文章:。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | package com.httpclient.demo; import java.io.IOException; import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpStatus; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.entity.ContentType; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; public class SimpleHttpClient { // 使用HttpClient获取博客园首页 public static void main(String[] args) throws ClientProtocolException, IOException { String targetUrl = "http://www.cnblogs.com/" ; // 1.建立HttpClient对象 CloseableHttpClient client = HttpClients.createDefault(); // 2.建立Get请求 HttpGet get = new HttpGet(targetUrl); // 3.发送Get请求 CloseableHttpResponse res = client.execute(get); // 4.处理请求结果 if (res.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { HttpEntity entity = res.getEntity(); ContentType contentType = ContentType.getOrDefault(entity); Charset charset = contentType.getCharset(); String mimeType = contentType.getMimeType(); // 获取字节数组 byte [] content = EntityUtils.toByteArray(entity); if (charset == null ) { // 默认编码转成字符串 String temp = new String(content); String regEx = "(?=<meta).*?(?<=charset=[\\'|\\\"]?)([[a-z]|[A-Z]|[0-9]|-]*)" ; Pattern p = Pattern.compile(regEx, Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(temp); if (m.find() && m.groupCount() == 1 ) { charset = Charset.forName(m.group( 1 )); } else { charset = Charset.forName( "ISO-8859-1" ); } } System.out.println( new String(content, charset)); } } } |
三、HttpClient模拟登陆
HTTP协议本来是无状态的,但为了保持会话的状态,使用Cookie保存Session信息,当向服务器发送请求时会附加这些会话信息,从而能区分不同会话的状态。用户登录的过程,简单而言,就是首先验证用户名与密码,然后服务器生成会话信息并由客户端保存到本地,最后用户凭借会话信息就能够访问用户信息等需要登录的网页。
HttpClient4.5通过CookieStore保存用户的会话信息,还提供HttpClientContext保存用户连接的信息。下面是一个使用HttpClient模拟知乎登陆的简单案例。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | package com.httpclient.demo; import java.io.IOException; import java.util.LinkedList; import java.util.List; import org.apache.http.Consts; import org.apache.http.NameValuePair; import org.apache.http.client.CookieStore; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.cookie.Cookie; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; /** * 模拟登陆知乎 */ public class ZhiHuTest { public static void main(String[] args) throws java.text.ParseException { String name = "username" ; String password = "password" // 全局请求设置 RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build(); // 创建cookie store的本地实例 CookieStore cookieStore = new BasicCookieStore(); // 创建HttpClient上下文 HttpClientContext context = HttpClientContext.create(); context.setCookieStore(cookieStore); // 创建一个HttpClient CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(globalConfig) .setDefaultCookieStore(cookieStore).build(); CloseableHttpResponse res = null ; // 创建本地的HTTP内容 try { try { // 创建一个get请求用来获取必要的Cookie,如_xsrf信息 HttpGet get = new HttpGet( "http://www.zhihu.com/" ); res = httpClient.execute(get, context); // 
获取常用Cookie,包括_xsrf信息 System.out.println( "访问知乎首页后的获取的常规Cookie:===============" ); for (Cookie c : cookieStore.getCookies()) { System.out.println(c.getName() + ": " + c.getValue()); } res.close(); // 构造post数据 List<NameValuePair> valuePairs = new LinkedList<NameValuePair>(); valuePairs.add( new BasicNameValuePair( "email" , name)); valuePairs.add( new BasicNameValuePair( "password" , password)); valuePairs.add( new BasicNameValuePair( "remember_me" , "true" )); UrlEncodedFormEntity entity = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8); entity.setContentType( "application/x-www-form-urlencoded" ); // 创建一个post请求 HttpPost post = new HttpPost( "https://www.zhihu.com/login/email" ); // 注入post数据 post.setEntity(entity); res = httpClient.execute(post, context); // 打印响应信息,查看是否登陆是否成功 System.out.println( "打印响应信息===========" ); HttpClientUtils.printResponse(res); res.close(); System.out.println( "登陆成功后,新的Cookie:===============" ); for (Cookie c : context.getCookieStore().getCookies()) { System.out.println(c.getName() + ": " + c.getValue()); } // 构造一个新的get请求,用来测试登录是否成功 HttpGet newGet = new HttpGet( "http://www.zhihu.com/question/following" ); res = httpClient.execute(newGet, context); String content = EntityUtils.toString(res.getEntity()); System.out.println( "登陆成功后访问的页面===============" ); System.out.println(content); res.close(); } finally { httpClient.close(); } } catch (IOException e) { e.printStackTrace(); } } } |