《网络爬虫之获取图片到本地》由会员分享,可在线阅读,更多相关《网络爬虫之获取图片到本地(4页珍藏版)》请在金锄头文库上搜索。
/*
 * Created on Aug 26, 2011 2:41:26 PM
 * HtmlSourceGetter.java
 * NOTICE OF PROPRIETARY RIGHTS
 * This program is a confidential trade secret and the property of author. Use, examination,
 * reproduction, disassembly, decompiling, transfer and/or disclosure to others of
 * all or any part of this software program are strictly prohibited except by express
 * written agreement with author.
 * ------------------------------------------------------------------------
 * Modification History
 * Date          Author  Version  Description
 * Aug 26, 2011  Cross   1.0      New
 * ------------------------------------------------------------------------
 */
package com.cross.tools;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/**
 * Small crawler utility: parses an HTML page with the org.htmlparser library and saves every
 * image referenced by an {@code <img>} tag to a local directory.
 */
public class HtmlSourceGetter {

    /** Directory used by {@link #extractLinks(String)}; same path the original hard-coded. */
    private static final String DEFAULT_OUTPUT_DIR = "c:/cross/";

    /** Size in bytes of the buffer used when copying an image to disk. */
    private static final int BUFFER_SIZE = 1024;

    // NOTE(review): these static fields are declared in the recovered listing. extractLinks no
    // longer uses them (it keeps per-download locals so every stream gets closed); they are
    // retained only in case the method bodies missing from the listing rely on them.
    private static HttpURLConnection con = null;
    private static BufferedInputStream bis = null;
    private static OutputStream out = null;

    /**
     * NOTE(review): only the signature of this method is present in the recovered listing; the
     * body is absent there and is left empty rather than guessed. Restore it from the original
     * file before relying on this method.
     *
     * @param url address of the page
     */
    public static void getSource(String url) {
    }

    /**
     * NOTE(review): only the signature of this method is present in the recovered listing; the
     * body is absent there and is left empty rather than guessed. Restore it from the original
     * file before relying on this method.
     *
     * @param url     address of the page
     * @param keyword keyword argument as declared in the listing; its meaning is not visible there
     */
    public static void parseHTML(String url, String keyword) {
    }

    /**
     * NOTE(review): only the signature of this method is present in the recovered listing; the
     * body is absent there and is left empty rather than guessed.
     *
     * @param list    node list argument as declared in the listing
     * @param keyword keyword argument as declared in the listing
     */
    private static void processNodeList(NodeList list, String keyword) {
    }

    /**
     * Downloads every image referenced by the page at {@code url} into {@code c:/cross/}.
     *
     * <p>Equivalent to {@code extractLinks(url, new File("c:/cross/"))}.
     *
     * @param url address of the HTML page to scan
     */
    public static void extractLinks(String url) {
        extractLinks(url, new File(DEFAULT_OUTPUT_DIR));
    }

    /**
     * Downloads every image referenced by an {@code <img>} tag on the page at {@code url} into
     * {@code outputDir}. Each file is named {@code <index>_<currentTimeMillis><extension>}.
     *
     * <p>Failures are reported on {@code System.err} and never abort the remaining downloads.
     * The original listing also sketched (commented out) handling of link tags and of
     * {@code frame src=} nodes; only image extraction was active, and only that is implemented.
     *
     * @param url       address of the HTML page to scan
     * @param outputDir directory to save images into; created if it does not exist
     * @throws IllegalArgumentException if {@code outputDir} is {@code null}
     */
    public static void extractLinks(String url, File outputDir) {
        if (outputDir == null) {
            throw new IllegalArgumentException("outputDir must not be null");
        }

        final NodeList images;
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("UTF-8");
            NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
            images = parser.extractAllNodesThatMatch(imageFilter);
        } catch (ParserException e) {
            // The original printed e.getStackTrace(), which only shows the array's identity.
            e.printStackTrace();
            return;
        }

        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            System.err.println("Cannot create output directory: " + outputDir);
            return;
        }

        for (int i = 0; i < images.size(); i++) {
            Node tag = images.elementAt(i);
            if (!(tag instanceof ImageTag)) {
                continue;
            }
            ImageTag image = (ImageTag) tag;
            String imageURL = image.getImageURL();
            String imageText = image.getText();
            System.out.println("imageURL:" + imageURL);
            System.out.println("imageText:" + imageText);
            if (imageURL == null || imageURL.length() == 0) {
                continue;
            }

            File target = new File(outputDir,
                    i + "_" + System.currentTimeMillis() + extensionOf(imageURL));
            try {
                download(imageURL, target);
            } catch (IOException e) {
                // One unreachable or broken image must not stop the rest of the page.
                System.err.println("Failed to download " + imageURL);
                e.printStackTrace();
            }
        }
    }

    /**
     * Copies the resource at {@code imageUrl} to {@code target}, always releasing the
     * connection and both streams, including when the transfer fails part-way.
     */
    private static void download(String imageUrl, File target) throws IOException {
        HttpURLConnection connection = null;
        BufferedInputStream in = null;
        OutputStream fileOut = null;
        try {
            URLConnection raw = new URL(imageUrl).openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                // Covers e.g. file: or ftp: sources, which would otherwise fail the cast.
                throw new IOException("Unsupported protocol for image URL: " + imageUrl);
            }
            connection = (HttpURLConnection) raw;
            connection.connect();
            in = new BufferedInputStream(connection.getInputStream());
            fileOut = new FileOutputStream(target);

            byte[] buf = new byte[BUFFER_SIZE];
            int size;
            while ((size = in.read(buf)) != -1) {
                fileOut.write(buf, 0, size);
            }
        } finally {
            close(fileOut);
            close(in);
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Returns the file extension (including the leading dot) of the last path segment of
     * {@code url}, ignoring any query string or fragment. Returns the empty string when the
     * segment has no extension or the extension contains characters other than letters and
     * digits, so the result is always safe to append to a file name.
     */
    private static String extensionOf(String url) {
        if (url == null) {
            return "";
        }
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        String name = path.substring(path.lastIndexOf('/') + 1);
        int dot = name.lastIndexOf('.');
        if (dot < 0 || dot == name.length() - 1) {
            return "";
        }
        for (int i = dot + 1; i < name.length(); i++) {
            if (!Character.isLetterOrDigit(name.charAt(i))) {
                return "";
            }
        }
        return name.substring(dot);
    }

    /** Closes {@code resource} if it is non-null, reporting (not propagating) any failure. */
    private static void close(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: downloads all images of the page served at
     * {@code http://localhost:8080/}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Alternative invocations kept from the original listing:
        // HtmlSourceGetter.parseHTML("http://localhost:8080/test/", "img");
        // HtmlSourceGetter.extractLinks("http://localhost:8080/test/");
        HtmlSourceGetter.extractLinks("http://localhost:8080/");
    }
}