网页爬虫抓取URL简单实现 -

tanglong8848

浏览: 66969 次
性别:
来自: 北京

最近访客更多访客>>

wenfei852283049

zjutxujie

heaven孤城

LD_21

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (144)

社区版块

存档分类

网页爬虫抓取URL简单实现

package com.ogilvy.sayes.util;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

/**
 * Minimal web crawler: downloads a page, extracts the absolute http(s) links
 * found in its quoted {@code href} attributes, prints them to standard output
 * and follows them up to a caller-supplied depth.
 *
 * @author long.tang
 */
public class SearchCrawler {

	/** Sentinel returned by {@link #myGetHttpFile2(String)} when a page cannot be fetched. */
	private static final String NOTHING = "nothing";

	/** Charset every fetched page is decoded with. */
	private static final String PAGE_CHARSET = "GBK";

	/** Attribute marker searched for (case-sensitively) when extracting links. */
	private static final String HREF = "href=";

	/**
	 * Downloads the resource behind {@code url} and decodes it as GBK text.
	 *
	 * @param url address to fetch; any scheme supported by {@link URL} works
	 * @return the decoded body, or the literal string {@code "nothing"} when
	 *         {@code url} is null or malformed or the download fails
	 */
	public String myGetHttpFile2(String url) {
		if (url == null) {
			return NOTHING;
		}
		// try-with-resources closes the stream even when a read fails midway.
		try (InputStream in = new URL(url).openStream()) {
			ByteArrayOutputStream body = new ByteArrayOutputStream();
			byte[] buffer = new byte[8192];
			int read;
			while ((read = in.read(buffer)) != -1) {
				body.write(buffer, 0, read);
			}
			return body.toString(PAGE_CHARSET);
		} catch (Exception e) {
			// Deliberately broad: fetching is best effort and callers rely on
			// getting the sentinel back instead of an exception.
			System.err.println("error>>>> could not fetch " + url);
			e.printStackTrace();
			return NOTHING;
		}
	}

	/**
	 * Prints every crawlable link found in {@code content}, one per line, and
	 * recursively crawls the linked pages.
	 *
	 * <p>{@code depth} is decremented before anything else happens and the call
	 * returns immediately when the result is below 1. Hence {@code depth == 2}
	 * lists the links of {@code content} only, {@code depth == 3} additionally
	 * lists the links of each linked page, and so on.
	 *
	 * @param content HTML text to scan; {@code null} is treated as an empty page
	 * @param depth   remaining crawl budget as described above
	 * @throws Exception declared for compatibility with existing callers; the
	 *                   current implementation does not throw checked exceptions
	 */
	public void doit(String content, int depth) throws Exception {
		crawl(content, depth, new HashMap<String, Integer>());
	}

	/**
	 * Recursive worker behind {@link #doit(String, int)}.
	 *
	 * @param expanded maps each URL already fetched during this crawl to the
	 *                 largest budget it was expanded with, so a page is only
	 *                 fetched again when a larger budget could reveal more links
	 */
	private void crawl(String content, int depth, Map<String, Integer> expanded) {
		int remaining = depth - 1;
		if (remaining < 1 || content == null) {
			return;
		}
		for (String link : extractLinks(content)) {
			System.out.println(link);
			if (remaining < 2) {
				// The recursive call would return before reading the page,
				// so downloading it would be wasted work.
				continue;
			}
			Integer previous = expanded.get(link);
			if (previous != null && previous.intValue() >= remaining) {
				continue;
			}
			expanded.put(link, Integer.valueOf(remaining));
			crawl(myGetHttpFile2(link), remaining, expanded);
		}
	}

	/**
	 * Collects the distinct crawlable URLs found in quoted {@code href}
	 * attributes of {@code content}, in order of first appearance.
	 */
	private static Set<String> extractLinks(String content) {
		Set<String> links = new LinkedHashSet<String>();
		int from = 0;
		while (true) {
			int attribute = content.indexOf(HREF, from);
			if (attribute < 0) {
				break;
			}
			int quoteAt = attribute + HREF.length();
			from = quoteAt;
			if (quoteAt >= content.length()) {
				break;
			}
			char quote = content.charAt(quoteAt);
			if (quote != '"' && quote != '\'') {
				continue; // unquoted attribute value: not supported, skip it
			}
			int closing = content.indexOf(quote, quoteAt + 1);
			if (closing < 0) {
				continue; // unterminated attribute: skip instead of failing
			}
			String url = content.substring(quoteAt + 1, closing).trim();
			from = closing + 1;
			if (isCrawlable(url)) {
				links.add(url);
			}
		}
		return links;
	}

	/**
	 * Tells whether {@code url} is an absolute http(s) address that does not
	 * mention a stylesheet, icon or executable extension.
	 */
	private static boolean isCrawlable(String url) {
		boolean absolute = url.startsWith("http://") || url.startsWith("https://");
		return absolute
				&& !url.contains(".css")
				&& !url.contains(".ico")
				&& !url.contains(".exe");
	}

	/** Crawls a fixed start page with a depth budget of 3. */
	public static void main(String arg[]) {

		SearchCrawler search = new SearchCrawler();
		try {
			search.doit(search.myGetHttpFile2("http://www.2345.com/"), 3);
		} catch (Exception e) {
			e.printStackTrace();
		}

	}

}

分享到：