介绍
目前我只会爬一些文章、图片之类的内容,别的还不会,这里不做展开
需要用到的jar包,maven中xml文件
<dependencies>
    <!-- HTTP client for fetching pages.
         4.5.13 fixes CVE-2020-13956 (request-URI host mis-parsing) present in 4.5.2. -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.13</version>
    </dependency>
    <!-- HTML parser / CSS selector engine.
         1.14.3 fixes CVE-2021-37714 (untrusted-HTML DoS) affecting all releases before 1.14.2. -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.14.3</version>
    </dependency>
    <dependency>
        <groupId>com.cloudhopper</groupId>
        <artifactId>ch-commons-io</artifactId>
        <version>2.3.0</version>
    </dependency>
    <!-- 4.13.1 fixes CVE-2020-15250 (world-readable TemporaryFolder) in 4.13. -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.13.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.7</version>
    </dependency>
</dependencies>
下面就介绍几个最基本的使用方法和两个测试案例(可以直接使用)
使用方法
@Test
public void testUrl() throws Exception {
    // Fetch and parse the page at the given URL; the second argument is the
    // connect/read timeout in milliseconds.
    Document document = Jsoup.parse(new URL("http://99887766554433221100.cn/"), 3000);
    // selectFirst returns null when the page has no <title>, so guard it —
    // the original getElementsByTag("title").first().text() would throw an
    // NPE on a title-less page. Also fixes the "titile" typo in the label.
    Element titleElement = document.selectFirst("title");
    String title = (titleElement == null) ? "" : titleElement.text();
    System.out.println("title--->" + title);
}
@Test
public void test() throws Exception {
    // Parse a local HTML file, giving the charset explicitly.
    Document doc = Jsoup.parse(new File("D:\\qianduan\\1\\index.html"), "utf-8");
    // Lookup by element id.
    printElement("e_id", doc.getElementById("xiaonanlashi"));
    // First <span> tag in the document.
    printElement("e_span_first", doc.getElementsByTag("span").first());
    // First element carrying the "class1" CSS class.
    printElement("e_class_first", doc.getElementsByClass("class1").first());
    // First element that has an "abc" attribute (regardless of its value).
    printElement("e_shuxing", doc.getElementsByAttribute("abc").first());
    // First element whose href attribute equals the given value exactly.
    printElement("e_shuxing_VALUE",
            doc.getElementsByAttributeValue("href", "http://99887766554433221100.cn/").first());
}

// Prints an element and its text content under the given label,
// mirroring the original "<label>---->" / "<label>.test()---->" output.
private static void printElement(String label, Element e) {
    System.out.println(label + "---->" + e);
    System.out.println(label + ".test()---->" + e.text());
}
@Test
public void xiaonan() throws Exception {
    // Parse the same local HTML file with an explicit charset.
    Document doc = Jsoup.parse(new File("D:\\qianduan\\1\\index.html"), "utf-8");
    // Tag selector example:
    //   for (Element e : doc.select("span")) { System.out.println("span--->" + e.text()); }
    // Id selector — must be prefixed with '#':
    //   for (Element e : doc.select("#test")) { System.out.println("id--->" + e.text()); }
    // Class selector — must be prefixed with '.':
    Element firstMatch = doc.select(".class1").first();
    System.out.println("class--->" + firstMatch.text());
    System.out.println(new Date());
}
分享案例
自己纯手敲的
第一个
//链接分析
//https://wall.alphacoders.com/search.php?search=landscape&lang=Chinese
//https://wall.alphacoders.com/search.php?search=landscape&lang=Chinese&page=1
//Element clss_ss=doc.select(".boxgrid>a>picture>img").first();
System.out.println("开始时间为-->"+new Date());
int id=1;
for (int i=1;i<=999;i++){
String url="https://wall.alphacoders.com/search.php?search=landscape&lang=Chinese&page=";
url+=i;
Document doc=Jsoup.parse(new URL(url),2000);
Elements imgs=doc.select(".boxgrid>a>picture>img");
for (Element img:imgs){
String img_url=img.attr("src");
URL target=new URL(img_url);
//URLConnection urlConnection=target.openConnection();
//InputStream inputStream=urlConnection.getInputStream();
//Exception in thread "main" java.io.IOException: Server returned HTTP response code: 403 for URL:
//抓取失败,服务器禁止Java程序,使用下面三行代码就可以了
HttpURLConnection httpConnection = (HttpURLConnection) new URL(img_url).openConnection();
httpConnection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
InputStream inputStream=httpConnection.getInputStream();
System.out.println("已经下"+id+"张图片了");
id++;
OutputStream outputStream=new FileOutputStream("D:\\imgs\\"+id+".png");
int flag=0;
while ((flag=inputStream.read())!=-1){
outputStream.write(flag);
}
outputStream.close();
inputStream.close();
}
}
System.out.println("结束时间为-->"+new Date());
第二个案例
//链接分析
//https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=1
//https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=2
//初步分析
//Document doc=Jsoup.parse(new URL("https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=1"),5000);
//Elements clss_ss=doc.select("._2gLVL _33Mfr");
//Element clss_ss=doc.select("._33Mfr>a>img").first();
//System.out.println(clss_ss.attr("src"));
//alifei01.cfp.cn/creative/vcg/veer/612/veer-375537960.jpg
System.out.println("开始时间为-->"+new Date());
int id=1;
for (int i=1;i<=78;i++){
String url="https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=";
url+=i;
Document doc=Jsoup.parse(new URL(url),5000);
Elements imgs=doc.select("._33Mfr>a>img");
for (Element img:imgs){
String img_url=img.attr("src");
img_url="https:"+img_url;
//System.out.println(img_url);
URL target=new URL(img_url);
URLConnection urlConnection=target.openConnection();
InputStream inputStream=urlConnection.getInputStream();
System.out.println("已经下"+id+"张图片了");
id++;
OutputStream outputStream=new FileOutputStream("D:\\img\\"+id+".png");
int flag=0;
while ((flag=inputStream.read())!=-1){
outputStream.write(flag);
}
outputStream.close();
inputStream.close();
}
}
System.out.println("结束时间为-->"+new Date());
文章全部是本人原创,请勿转发,谢谢配合,版权所有-南香香-你会喜欢我吗
Java爬虫 - 南香香
rjnpxjkppe http://www.gf5kcen96m4923vr3j168t7ww7i0h3w8s.org/
arjnpxjkppe
[url=http://www.gf5kcen96m4923vr3j168t7ww7i0h3w8s.org/]urjnpxjkppe[/url]
Java爬虫 - 南香香
[url=http://www.gwlsta7o4049l7n4cwm07px53q3726w6s.org/]uvowmshrjl[/url]
avowmshrjl
vowmshrjl http://www.gwlsta7o4049l7n4cwm07px53q3726w6s.org/