Posted by geez_itsjustme (geez_itsjustme), 12 April 2005
Hi! I'm altering some web crawler code. How do I restrict the pages searched to the site domain of the initially supplied URL?
Secondly, how can I restrict the number of queries to 5 per second?
Code:
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collection;
import javax.swing.text.html.HTMLEditorKit;

// NOTE: robotSafe(), HTMLParse and Parser are assumed to be defined
// elsewhere in this project; they are not shown here.
public class Spider {

    /** A collection of URLs that resulted in an error */
    protected Collection workloadError = new ArrayList(3);

    /** A collection of URLs that are waiting to be processed */
    protected Collection workloadWaiting = new ArrayList(3);

    /** A collection of URLs that were processed */
    protected Collection workloadProcessed = new ArrayList(3);

    /** A flag that indicates whether this process should be canceled */
    protected boolean cancel = false;

    public static final String DISALLOW = "Disallow:";

    /**
     * The constructor
     *
     * @param report A class that implements the ISpiderReportable interface,
     *               that will receive information that the spider finds.
     */
    // public Spider(ISpiderReportable report)
    // {
    //     this.report = report;
    // }
    public Spider() {
    }

    /**
     * Get the URLs that resulted in an error.
     *
     * @return A collection of URLs.
     */
    public Collection getWorkloadError() {
        return workloadError;
    }

    /**
     * Get the URLs that are waiting to be processed.
     * You should add one URL to this collection to begin the spider.
     *
     * @return A collection of URLs.
     */
    public Collection getWorkloadWaiting() {
        return workloadWaiting;
    }

    /**
     * Get the URLs that were processed by this spider.
     *
     * @return A collection of URLs.
     */
    public Collection getWorkloadProcessed() {
        return workloadProcessed;
    }

    /** Clear all of the workloads. */
    public void clear() {
        getWorkloadError().clear();
        getWorkloadWaiting().clear();
        getWorkloadProcessed().clear();
    }

    /** Set a flag that will cause the begin method to return before it is done. */
    public void cancel() {
        cancel = true;
    }

    /**
     * Add a URL for processing.
     *
     * @param url
     */
    public void addURL(URL url) {
        if (getWorkloadWaiting().contains(url)) return;
        if (getWorkloadError().contains(url)) return;
        if (getWorkloadProcessed().contains(url)) return;
        // log("Adding to workload: " + url);
        getWorkloadWaiting().add(url);
    }

    private void log(String str) {
        System.out.println(str);
    }

    /**
     * Called internally to process a URL
     *
     * @param url The URL to be processed.
     */
    public void processURL(URL url) {
        try {
            log("Processing: " + url);

            // get the URL's contents
            URLConnection connection = url.openConnection();
            connection.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
            connection.setRequestProperty("Accept-Language", "en-us");
            connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
            connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
            connection.setRequestProperty("User-Agent", "naughtyOLA; Group OLA's");
            connection.setRequestProperty("Host", "localhost:1234");
            connection.setRequestProperty("Content-Length", "13");
            connection.setRequestProperty("Connection", "Keep-Alive");
            connection.setRequestProperty("Cache-Control", "no-cache");
            // connection.setRequestProperty("Referer", "http://www.eastandard.net");
            System.out.println("COONNN " + connection.getRequestProperty("User-Agent"));

            if ((connection.getContentType() != null)
                    && !connection.getContentType().toLowerCase().startsWith("text/")) {
                getWorkloadWaiting().remove(url);
                getWorkloadProcessed().add(url);
                log("Not processing because content type is: " + connection.getContentType());
                return;
            }

            // can only search http: protocol URLs
            if (url.getProtocol().compareTo("http") != 0) return;

            // test to make sure it is robot-safe before searching
            if (!robotSafe(url)) return;

            // read the URL
            InputStream is = connection.getInputStream();
            // parseWithRegexp(is);
            parseIt(is, url);

        } catch (IOException e) {
            getWorkloadWaiting().remove(url);
            getWorkloadError().add(url);
            log("Error: " + url);
            // report.spiderURLError(url);
            return;
        }

        // mark URL as complete
        getWorkloadWaiting().remove(url);
        getWorkloadProcessed().add(url);
        log("Complete: " + url);
        // if (WorkloadProcessed = 5) { pause for a sec; }
    }

    private void parseIt(InputStream is, URL url) throws IOException {
        Reader r = new InputStreamReader(is);
        // parse the URL
        HTMLEditorKit.Parser parse = new HTMLParse().getParser();
        parse.parse(r, new Parser(url), true);
    }

    /** Called to start the spider */
    public void begin() {
        cancel = false;
        int noPages = 0;
        while (!getWorkloadWaiting().isEmpty() && !cancel) {
            Object list[] = getWorkloadWaiting().toArray();
            for (int i = 0; (i < list.length) && !cancel; i++) {
                noPages++;
                System.out.println();
                System.out.println("---------------------------- " + noPages + "---------------------------- ");
                if (noPages >= 5) {
                    System.out.println();
                    System.out.println("End of allowed pages");
                    return;
                }
                processURL((URL) list[i]);
            }
        }
    }

    public static void main(String[] args) {
        String url = "http://www.eastandard.net/";
        try {
            Spider spider = new Spider();
            spider.clear();
            URL base = new URL(url);
            spider.addURL(base);
            spider.begin();
        } catch (MalformedURLException e) {
            System.out.println("Malformed URL: " + url);
        }
    }
}
Thanks,
lili lulu
Posted by admin (Graham Ellis), 12 April 2005
I would do it by calling the getHost method on your URL objects and seeing if they're equal, and by using a Thread.sleep for 1 second after every 5 pages gathered.
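Something along these lines, as a rough sketch only - not tested against your Spider class, and the names SpiderThrottle, sameSite and throttle are just illustrative. You'd call something like sameSite(base, link) before addURL queues a link, and throttle(noPages) in begin() after each processURL call:

Code:
import java.net.MalformedURLException;
import java.net.URL;

/** Illustrative sketch of the two suggestions above. */
public class SpiderThrottle {

    /** Returns true if the candidate URL is on the same host as the starting URL. */
    public static boolean sameSite(URL base, URL candidate) {
        // getHost() gives e.g. "www.eastandard.net"; compare case-insensitively
        return base.getHost().equalsIgnoreCase(candidate.getHost());
    }

    /** Sleeps for one second after every fifth page fetched. */
    public static void throttle(int pagesFetched) {
        if (pagesFetched > 0 && pagesFetched % 5 == 0) {
            try {
                Thread.sleep(1000);   // keeps the rate at roughly 5 requests per second
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();  // restore the interrupt flag
            }
        }
    }

    public static void main(String[] args) throws MalformedURLException {
        URL base = new URL("http://www.eastandard.net/");
        URL inside = new URL("http://www.eastandard.net/news/");
        URL outside = new URL("http://www.example.com/");

        System.out.println(sameSite(base, inside));   // true
        System.out.println(sameSite(base, outside));  // false

        // in the crawl loop you would call throttle(noPages) after each processURL(...)
        for (int noPages = 1; noPages <= 10; noPages++) {
            throttle(noPages);
        }
    }
}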
This page is a thread posted to the opentalk forum at www.opentalk.org.uk and archived here for reference.