爬虫:java版

爬虫:java版
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>

package com.lym.crawlerDemo;

 

import java.io.DataInputStream;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.net.URL;

import java.util.ArrayList;

import java.util.List;

 

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.select.Elements;

 

import com.lym.mode.Picture;

 

/**
 * Crawls the picture list pages of http://m.qqba.com/ and downloads every
 * image found there to the local disk.
 *
 * @author Administrator
 */
public class CrawlerPicture {

	/** Number of the first list page to crawl (inclusive). */
	public final static int STARTPAGE = 301;

	/**
	 * Page number at which crawling stops (exclusive): the pages
	 * {@code STARTPAGE .. ENDPAGE - 1} are fetched. This is an upper bound,
	 * not a page count.
	 */
	public final static int ENDPAGE = 500;

	/**
	 * Collects the {@code src} and {@code alt} attribute of every {@code img}
	 * element located inside a {@code div.image-cell} on each list page.
	 * Pictures are numbered consecutively starting at 1, in the order found.
	 *
	 * @return one {@link Picture} per image element found, never {@code null}
	 * @throws IOException if any list page cannot be fetched or parsed
	 */
	public static List<Picture> getPictureUrl() throws IOException {
		int number = 1;
		List<Picture> pics = new ArrayList<Picture>();
		for (int i = STARTPAGE; i < ENDPAGE; i++) {
			String url = "http://m.qqba.com/people/list/" + i + ".htm";
			Document doc = Jsoup.connect(url).get();
			Elements divList = doc.body().select("div.image-cell");
			for (int j = 0; j < divList.size(); j++) {
				Elements imgList = divList.get(j).select("img");
				for (int k = 0; k < imgList.size(); k++) {
					// Resolve the src against the page URL so that relative
					// links are still downloadable later; keep the raw value
					// when it cannot be resolved.
					String src = imgList.get(k).absUrl("src");
					if (src.length() == 0) {
						src = imgList.get(k).attr("src");
					}

					Picture pic = new Picture();
					pic.setId(number++);
					pic.setSrc(src);
					pic.setAlt(imgList.get(k).attr("alt"));
					pics.add(pic);
				}
			}
		}
		return pics;
	}

	/**
	 * Opens a stream over the content of the given picture URL.
	 * The caller is responsible for closing the returned stream.
	 *
	 * @param picUrl absolute URL of the picture
	 * @return an open input stream delivering the picture bytes
	 * @throws IOException if the URL is malformed or the connection fails
	 */
	public static InputStream getPictureInputStream(String picUrl) throws IOException {
		// A plain stream is enough here: only raw bytes are copied.
		return new URL(picUrl).openStream();
	}

	/**
	 * Copies the picture bytes from {@code in} into a file named
	 * {@code D:/picture/<alt>--<id>.jpg}. The target directory is created when
	 * missing. The input stream is not closed by this method.
	 *
	 * @param in  stream delivering the picture bytes
	 * @param pic picture whose alt text and id form the file name
	 * @throws IOException if the directory cannot be created or the copy fails
	 */
	public static void savePicture(InputStream in, Picture pic) throws IOException {
		// The alt text comes from a scraped page: replace path separators and
		// characters that are illegal in file names so the path stays valid
		// and inside the target directory.
		String safeAlt = String.valueOf(pic.getAlt()).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
		File target = new File("D:/picture/" + safeAlt + "--" + pic.getId() + ".jpg");

		File dir = target.getParentFile();
		if (dir != null && !dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
			throw new IOException("Cannot create directory " + dir);
		}

		FileOutputStream fos = new FileOutputStream(target);
		try {
			byte[] buf = new byte[1024];
			int len;
			// read() signals end of stream with -1.
			while ((len = in.read(buf)) != -1) {
				fos.write(buf, 0, len);
			}
		} finally {
			// Release the file handle even when the copy fails half-way.
			fos.close();
		}
	}

	/**
	 * Entry point: collects all picture URLs and downloads them one by one.
	 * A picture that fails to download is reported on stderr and skipped so
	 * that a single broken link does not abort the whole run.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		try {
			List<Picture> pics = getPictureUrl();
			System.out.println("图片正在下载...");
			for (int i = 0; i < pics.size(); i++) {
				Picture pic = pics.get(i);
				try {
					InputStream in = getPictureInputStream(pic.getSrc());
					try {
						savePicture(in, pic);
					} finally {
						in.close();
					}
				} catch (IOException e) {
					System.err.println("Failed to download " + pic + ": " + e);
				}
			}
			System.out.println("下载完成!");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

package com.lym.mode;

 

public class Picture {

	/** Number assigned to the picture. */
	private int id;

	/** Address of the picture. */
	private String src;

	/** Descriptive text of the picture. */
	private String alt;

	/** @return the picture number */
	public int getId() {
		return this.id;
	}

	/** @param id the picture number to store */
	public void setId(int id) {
		this.id = id;
	}

	/** @return the picture address */
	public String getSrc() {
		return this.src;
	}

	/** @param src the picture address to store */
	public void setSrc(String src) {
		this.src = src;
	}

	/** @return the descriptive text */
	public String getAlt() {
		return this.alt;
	}

	/** @param alt the descriptive text to store */
	public void setAlt(String alt) {
		this.alt = alt;
	}

	/**
	 * Renders all three fields in the form
	 * {@code Picture [id=..., src=..., alt=...]}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Picture [id=");
		text.append(this.id);
		text.append(", src=").append(this.src);
		text.append(", alt=").append(this.alt);
		text.append("]");
		return text.toString();
	}
}