Java爬虫

首页 / 🍁编程类 / 正文

介绍

我目前只会爬点文章、图片什么的,别的就不用想了,一句话:不会

需要用到的jar包(Maven项目中pom.xml文件里的依赖配置)

    <!-- Libraries pulled in for the crawler examples that follow. -->
    <dependencies>
        <!-- Apache HttpComponents HttpClient. The snippets visible in this article
             use java.net connections instead, so this may be optional here. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>


        <!-- jsoup: provides Jsoup.parse(...) and the element/selector lookups used below. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.1</version>
        </dependency>


        <!-- NOTE(review): this is the Cloudhopper artifact, not commons-io:commons-io;
             none of the visible snippets reference it. Confirm it is the intended one. -->
        <dependency>
            <groupId>com.cloudhopper</groupId>
            <artifactId>ch-commons-io</artifactId>
            <version>2.3.0</version>
        </dependency>

        <!-- JUnit 4, test scope only: supplies the @Test annotation on the example methods. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13</version>
            <scope>test</scope>
        </dependency>

        <!-- Apache Commons Lang 3; not referenced by the snippets visible in this article. -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.7</version>
        </dependency>

    </dependencies>

下面就介绍几个最基本的使用方法和两个测试案例(可以直接用的)

使用方法

    /**
     * Downloads a live page and prints the text of its first {@code <title>} element.
     *
     * @throws Exception if the page cannot be fetched or parsed
     */
    @Test
    public void testUrl() throws Exception {
        // Jsoup.parse(URL, int) takes the address to visit and the request timeout.
        final URL pageUrl = new URL("http://99887766554433221100.cn/");
        final Document page = Jsoup.parse(pageUrl, 3000);

        // Tag-name lookup; only the first <title> match is read.
        final String titleText = page.getElementsByTag("title").first().text();

        final String line = "titile--->" + titleText;
        System.out.println(line);
    }


    /**
     * Demonstrates the basic jsoup lookup methods against a local HTML file:
     * by id, by tag name, by class, by attribute name, and by attribute name plus value.
     * For every lookup the matched element and then its text are printed.
     *
     * @throws Exception if the file cannot be read or parsed
     */
    @Test
    public void test() throws Exception {
        final File htmlFile = new File("D:\\qianduan\\1\\index.html");
        final Document page = Jsoup.parse(htmlFile, "utf-8");

        // Lookup by id.
        printElement("e_id", page.getElementById("xiaonanlashi"));

        // First <span> in the document.
        printElement("e_span_first", page.getElementsByTag("span").first());

        // First element carrying the class "class1".
        printElement("e_class_first", page.getElementsByClass("class1").first());

        // First element that has an attribute named "abc".
        printElement("e_shuxing", page.getElementsByAttribute("abc").first());

        // First element whose "href" attribute equals the given value.
        printElement(
                "e_shuxing_VALUE",
                page.getElementsByAttributeValue("href", "http://99887766554433221100.cn/").first());
    }

    /** Prints the element itself, then its text, each prefixed with the given label. */
    private static void printElement(String label, Element element) {
        System.out.println(label + "---->" + element);
        System.out.println(label + ".test()---->" + element.text());
    }


    /**
     * Demonstrates CSS-style selection with {@code Document.select}: prints the text of the
     * first element matching {@code .class1} in a local HTML file, then the current date.
     *
     * @throws Exception if the file cannot be read or parsed
     */
    @Test
    public void xiaonan() throws Exception {
        final File htmlFile = new File("D:\\qianduan\\1\\index.html");
        final Document page = Jsoup.parse(htmlFile, "utf-8");

        // Other selector forms that can be iterated the same way:
        //   page.select("span")   -> tag selector, one entry per <span>
        //   page.select("#test")  -> id selector; the leading '#' is required

        // Class selector; the leading '.' is required.
        final Element firstMatch = page.select(".class1").first();
        final String classText = firstMatch.text();
        System.out.println("class--->" + classText);

        final Date now = new Date();
        System.out.println(now);
    }

分享案例

自己纯手敲的

第一个案例

//链接分析
        //https://wall.alphacoders.com/search.php?search=landscape&lang=Chinese
        //https://wall.alphacoders.com/search.php?search=landscape&lang=Chinese&page=1
        //Element clss_ss=doc.select(".boxgrid>a>picture>img").first();
        System.out.println("开始时间为-->"+new Date());
        int id=1;
        for (int i=1;i<=999;i++){
            String url="https://wall.alphacoders.com/search.php?search=landscape&lang=Chinese&page=";
            url+=i;
            Document doc=Jsoup.parse(new URL(url),2000);
            Elements imgs=doc.select(".boxgrid>a>picture>img");
            for (Element img:imgs){
                String img_url=img.attr("src");
                URL target=new URL(img_url);
                //URLConnection urlConnection=target.openConnection();
                //InputStream inputStream=urlConnection.getInputStream();
                //Exception in thread "main" java.io.IOException: Server returned HTTP response code: 403 for URL:
                //抓取失败,服务器禁止Java程序,使用下面三行代码就可以了

                HttpURLConnection httpConnection = (HttpURLConnection) new URL(img_url).openConnection();
                httpConnection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
                InputStream inputStream=httpConnection.getInputStream();
                System.out.println("已经下"+id+"张图片了");
                id++;
                OutputStream outputStream=new FileOutputStream("D:\\imgs\\"+id+".png");
                int flag=0;
                while ((flag=inputStream.read())!=-1){
                   outputStream.write(flag);
                }
                outputStream.close();
                inputStream.close();
            }
        }
        System.out.println("结束时间为-->"+new Date());

第二个案例

//链接分析
        //https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=1
        //https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=2

        //初步分析
        //Document doc=Jsoup.parse(new URL("https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=1"),5000);
        //Elements clss_ss=doc.select("._2gLVL _33Mfr");
        //Element clss_ss=doc.select("._33Mfr>a>img").first();
        //System.out.println(clss_ss.attr("src"));
        //alifei01.cfp.cn/creative/vcg/veer/612/veer-375537960.jpg
        System.out.println("开始时间为-->"+new Date());
        int id=1;
        for (int i=1;i<=78;i++){
            String url="https://www.photocome.com/search?q=%E9%A3%8E%E6%99%AF&page=";
            url+=i;
            Document doc=Jsoup.parse(new URL(url),5000);
            Elements imgs=doc.select("._33Mfr>a>img");
            for (Element img:imgs){
                String img_url=img.attr("src");
                img_url="https:"+img_url;
                //System.out.println(img_url);
                URL target=new URL(img_url);
                URLConnection urlConnection=target.openConnection();
                InputStream inputStream=urlConnection.getInputStream();
                System.out.println("已经下"+id+"张图片了");
                id++;
                OutputStream outputStream=new FileOutputStream("D:\\img\\"+id+".png");
                int flag=0;
                while ((flag=inputStream.read())!=-1){
                    outputStream.write(flag);
                }
                outputStream.close();
                inputStream.close();
            }

        }
        System.out.println("结束时间为-->"+new Date());
评论区
头像