<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
package com.lym.crawlerDemo;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.lym.mode.Picture;
/**
 * Crawls the picture list pages of http://m.qqba.com/ and downloads every
 * image found there to the local disk.
 *
 * @author Administrator
 */
public class CrawlerPicture {
    /** First list page to crawl (inclusive). */
    public final static int STARTPAGE = 301;
    /** Page number at which crawling stops (exclusive): pages STARTPAGE .. ENDPAGE-1 are fetched. */
    public final static int ENDPAGE = 500;

    /** Directory the downloaded pictures are written to. */
    private static final String SAVE_DIR = "D:/picture/";

    /**
     * Collects the {@code src} and {@code alt} attributes of every
     * {@code img} element located inside a {@code div.image-cell} on the
     * crawled list pages.
     *
     * @return one {@link Picture} per image, numbered sequentially from 1
     * @throws IOException if any list page cannot be fetched
     */
    public static List<Picture> getPictureUrl() throws IOException {
        int number = 1;
        List<Picture> pics = new ArrayList<Picture>();
        for (int i = STARTPAGE; i < ENDPAGE; i++) {
            String url = "http://m.qqba.com/people/list/" + i + ".htm";
            Document doc = Jsoup.connect(url).get();
            Elements divList = doc.body().select("div.image-cell");
            for (int j = 0; j < divList.size(); j++) {
                Elements imgList = divList.get(j).select("img");
                for (int k = 0; k < imgList.size(); k++) {
                    // Resolve relative / protocol-relative links against the page
                    // URL so that java.net.URL can open them later; fall back to
                    // the raw attribute when resolution yields nothing.
                    String src = imgList.get(k).absUrl("src");
                    if (src.isEmpty()) {
                        src = imgList.get(k).attr("src");
                    }
                    Picture pic = new Picture();
                    pic.setId(number++);
                    pic.setSrc(src);
                    pic.setAlt(imgList.get(k).attr("alt"));
                    pics.add(pic);
                }
            }
        }
        return pics;
    }

    /**
     * Opens a stream on the image located at the given URL.
     *
     * @param picUrl absolute URL of the image
     * @return an open stream on the image bytes; the caller must close it
     * @throws IOException if the URL is malformed or the connection fails
     */
    public static InputStream getPictureInputStream(String picUrl) throws IOException {
        return new URL(picUrl).openStream();
    }

    /**
     * Writes the image bytes read from {@code in} to
     * {@code D:/picture/<alt>--<id>.jpg}, creating the directory if needed.
     * The stream {@code in} is NOT closed by this method.
     *
     * @param in  stream delivering the image bytes
     * @param pic picture whose alt text and id form the file name
     * @throws IOException if the target directory cannot be created or the
     *                     copy fails
     */
    public static void savePicture(InputStream in, Picture pic) throws IOException {
        File target = new File(
                SAVE_DIR + toSafeFileName(pic.getAlt()) + "--" + pic.getId() + ".jpg");
        File dir = target.getParentFile();
        // mkdirs() returns false when the directory already exists, hence the second check.
        if (dir != null && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory " + dir);
        }
        // try-with-resources guarantees the file handle is released even if the copy fails.
        try (FileOutputStream fos = new FileOutputStream(target)) {
            byte[] buf = new byte[1024];
            int len;
            while ((len = in.read(buf)) != -1) {
                fos.write(buf, 0, len);
            }
        }
    }

    /**
     * Replaces characters that are illegal in file names (or that would act
     * as path separators) with an underscore; {@code null} becomes "".
     */
    private static String toSafeFileName(String text) {
        if (text == null) {
            return "";
        }
        return text.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }

    /**
     * Demo entry point: collects all picture URLs, then downloads them one by
     * one. A picture that fails to download is reported on stderr and
     * skipped so that the remaining pictures are still fetched.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            List<Picture> pics = getPictureUrl();
            System.out.println("图片正在下载...");
            for (Picture pic : pics) {
                try (InputStream in = getPictureInputStream(pic.getSrc())) {
                    savePicture(in, pic);
                } catch (IOException e) {
                    System.err.println("Failed to download " + pic + ": " + e);
                }
            }
            System.out.println("下载完成!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package com.lym.mode;
/**
 * Mutable value holder describing one crawled picture: a sequence number,
 * the image address and its descriptive text.
 */
public class Picture {
    /** Picture number. */
    private int id;
    /** Picture address. */
    private String src;
    /** Descriptive text of the picture. */
    private String alt;

    /** @return the picture number */
    public int getId() {
        return id;
    }

    /** @param id the picture number to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the picture address, or {@code null} if none was set */
    public String getSrc() {
        return src;
    }

    /** @param src the picture address to store */
    public void setSrc(String src) {
        this.src = src;
    }

    /** @return the descriptive text, or {@code null} if none was set */
    public String getAlt() {
        return alt;
    }

    /** @param alt the descriptive text to store */
    public void setAlt(String alt) {
        this.alt = alt;
    }

    /** Renders all three fields in the form {@code Picture [id=…, src=…, alt=…]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Picture [id=");
        text.append(id);
        text.append(", src=").append(src);
        text.append(", alt=").append(alt);
        text.append("]");
        return text.toString();
    }
}