WebCollector 是一个无须配置、便于二次开发的 JAVA 爬虫框架(内核),提供精简的 API,只需少量代码即可实现一个功能强大的爬虫。
源码中集成了 Jsoup,可进行精准的网页解析,2.x 版本中集成了 selenium,可以处理 JavaScript 生成的数据。
操作浏览器:
谷歌浏览器chromedriver(使用时需要与本地浏览器版本一致):http://npm.taobao.org/mirrors/chromedriver/
无界面浏览器phantomjs:https://phantomjs.org/download.html
代码:
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import java.io.*; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.net.URLEncoder; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; //采集wallhere public class CollectionWallhere { private static WebDriver driver; private static ChromeOptions options; private static String html; private static String type; public static String imgurl=null; static { // System.setProperty("webdriver.chrome.driver", "./drivers/chromedriver.exe"); // options= new ChromeOptions(); // Map<String, Object> prefs = new HashMap<String, Object>(); // prefs.put("profile.managed_default_content_settings.images", 2); // options.setExperimentalOption("prefs", prefs); // //=================↑禁止图片加载↑======================= // driver = new ChromeDriver(options); } public static void main(String[] args){ Scanner scanner =new Scanner(System.in); System.out.println("采集图片需要选择类型,如:动漫,风景,建筑,美女...."); //类型:动漫 = 页数:5105 System.out.println("请输入你需要采集的类型:"); String lx = scanner.next(); System.out.println("请输入你需要采集的页数:"); Integer page = scanner.nextInt(); new CollectionWallhere().wallhereRun(lx,page); } /** * 采集启动 */ public static void wallhereRun(String lx,Integer page){ WallhereConfig(); try { type = URLEncoder.encode(lx, "UTF-8" ); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } driver.get("https://wallhere.com/zh/wallpapers?q="+type);//访问 try { Thread.sleep(10 * 1000);//延时 } catch (InterruptedException e) { e.printStackTrace(); } html = driver.getPageSource();//获取网页代码 WallhereByImgByJson(page); } /** * 初始配置 */ public static void WallhereConfig(){ System.setProperty("webdriver.chrome.driver", "./drivers/chromedriver.exe"); options= new ChromeOptions(); Map<String, Object> prefs = new 
HashMap<String, Object>(); prefs.put("profile.managed_default_content_settings.images", 2); options.setExperimentalOption("prefs", prefs); //=================↑禁止图片加载↑======================= driver = new ChromeDriver(options); } /** * Wallhere图片采集 * @param page * @return */ public static void WallhereByImgByJson(Integer page){ HashMap<String, String> map =null; for (int i = 1; i <page+1 ; i++) { map = (HashMap<String, String>) getWallhereCookies();//刷新页面获取cookie List<String> listurl = getWallhereImgByUrl("https://wallhere.com/zh/wallpapers?q="+type+"&page="+i+"&format=json",map); for (String url:listurl) { imgurl = getWallhereByImgUrl(url,map);//获取到图片url System.out.println("下载:"+imgurl); new Thread() { @Override public void run() { downImages(System.getProperty("user.dir")+"/img",imgurl);//下载图片 } }.start(); } try { Thread.sleep(3 * 60 * 1000);//延时 } catch (InterruptedException e) { e.printStackTrace(); } } driver.quit();//结束关闭 } public static Map getWallhereCookies(){ driver.navigate().refresh();//刷新页面 try { Thread.sleep(5 * 1000);//延时 } catch (InterruptedException e) { e.printStackTrace(); } //获取cookie列表 Set<Cookie> cookies = driver.manage().getCookies(); HashMap<String, String> map = new HashMap<String,String>(); for (Cookie c:cookies) { map.put(c.getName(), c.getValue()); } System.out.println("获取cookie:"+map.toString()); return map; } /** * 获取一组图片页面地址 * @param url * @param map * @return */ public static List getWallhereImgByUrl(String url,Map map){ // String url = "https://wallhere.com/zh/wallpapers?q=%E5%8A%A8%E6%BC%AB&page=2&format=json"; Document doc = null; try { doc = Jsoup.connect(url) .header("Accept", "*/*") .header("Accept-Encoding", "gzip, deflate") .header("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3") .header("Content-Type", "application/json;charset=UTF-8") .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36") .cookies(map) .ignoreContentType(true) 
.postDataCharset("GBK") .timeout(60*1000) .get(); } catch (IOException e) { e.printStackTrace(); } Matcher m = Pattern.compile("<a href=\\\"\\\\"(.*?)\\\\"\\\"><img alt=").matcher(doc.html()); List<String> list = new LinkedList<>(); while (m.find()){ list.add("https://wallhere.com"+m.group(1).replace("\\/","/")); // System.out.println("https://wallhere.com"+m.group(1).replace("\\/","/")); } return list; } /** * 获取图片url * @return */ public static String getWallhereByImgUrl(String url,Map map){ Document doc = null; String imgurl=null; try { // String url = "https://wallhere.com/zh/wallpaper/1275327"; doc = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36") .cookies(map) .ignoreContentType(true) .postDataCharset("GBK") .timeout(60*1000) .get(); } catch (IOException e) { e.printStackTrace(); } // System.out.println(doc.html()); Matcher m = Pattern.compile("<meta name=\\\"twitter:image\\\" content=\\\"(.*?)\\\"> ").matcher(doc.html()); while (m.find()){ imgurl = m.group(1); } return imgurl; } /** * 下载图片到指定目录 * @param filePath 文件路径 * @param imgUrl 图片URL */ public static void downImages(String filePath, String imgUrl) { // 若指定文件夹没有,则先创建 File dir = new File(filePath); if (!dir.exists()) { dir.mkdirs(); } // 截取图片文件名 String fileName = imgUrl.substring(imgUrl.lastIndexOf('/') + 1, imgUrl.length()); try { // 文件名里面可能有中文或者空格,所以这里要进行处理。但空格又会被URLEncoder转义为加号 String urlTail = URLEncoder.encode(fileName, "UTF-8"); // 因此要将加号转化为UTF-8格式的%20 imgUrl = imgUrl.substring(0, imgUrl.lastIndexOf('/') + 1) + urlTail.replaceAll("\\+", "\\%20"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } // 写出的路径 File file = new File(filePath + File.separator + fileName.replace("!d","")); BufferedOutputStream out=null; InputStream in=null; try { // 获取图片URL URL url = new URL(imgUrl); // 获得连接 URLConnection connection = url.openConnection(); // 设置延时相应时间 connection.setConnectTimeout(60 * 1000); 
connection.setReadTimeout(60 * 1000); //防止屏蔽程序抓取而返回403错误 connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)"); // 获得输入流 in = connection.getInputStream(); // 获得输出流 out = new BufferedOutputStream(new FileOutputStream(file)); // 构建缓冲区 byte[] buf = new byte[1024]; int size; // 写入到文件 while (-1 != (size = in.read(buf))) { out.write(buf, 0, size); } } catch (MalformedURLException e) { System.out.println("下载异常A"); // e.printStackTrace(); } catch (IOException e) { System.out.println("下载异常B"); // e.printStackTrace(); }finally { try { out.close(); in.close(); } catch (IOException e) { System.out.println("关闭下载流失败!"); } } } }
本代码需要配合 chromedriver.exe(Chrome 浏览器驱动)使用。另外我提供一份已打包好、配合 JDK 1.8.x 可直接使用的包,包里附带 chromedriver.exe,可供开发使用。
发表评论