For 2023 (and 2024 ...) - we are now fully retired from IT training.
We have made many, many friends over 25 years of teaching about Python, Tcl, Perl, PHP, Lua, Java, C and C++ - and MySQL, Linux and Solaris/SunOS too. Our training notes are now very much out of date, but due to upward compatibility most of our examples remain operational and even relevant, and you are welcome to make use of them "as seen" and at your own risk.

Lisa and I (Graham) now live in what was our training centre in Melksham - happy to meet with former delegates here - but do check ahead before coming round. We are far from inactive - rather, enjoying the times that we are retired but still healthy enough in mind and body to be active!

I am also active in many other areas and still look after a lot of web sites - you can find an index ((here))
Web Crawler

Posted by geez_itsjustme (geez_itsjustme), 12 April 2005
Hi! I'm altering a web crawler code. How do I restrict the pages searched to the site domain of the initially supplied URL?

Secondly, how can I restrict the number of queries to 5/sec?

Code:
import java.util.ArrayList;
import java.util.Collection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import javax.swing.text.html.HTMLEditorKit;

// Note: HTMLParse, Parser and robotSafe() come from elsewhere in the project
// and are not shown in this excerpt.
public class Spider {

 /**
  * A collection of URLs that resulted in an error
  */
 protected Collection workloadError = new ArrayList(3);

 /**
  * A collection of URLs that are waiting to be processed
  */
 protected Collection workloadWaiting = new ArrayList(3);

 /**
  * A collection of URLs that were processed
  */
 protected Collection workloadProcessed = new ArrayList(3);


 /**
  * A flag that indicates whether this process
  * should be canceled
  */
 protected boolean cancel = false;

 public static final String DISALLOW = "Disallow:";

 /**
  * The constructor
  *
  * @param report A class that implements the ISpiderReportable
  * interface, that will receive information that the
  * spider finds.
  */
//  public Spider(ISpiderReportable report)
//    {
//      this.report = report;
//    }
   public Spider()
     {
     }

 /**
  * Get the URLs that resulted in an error.
  *
  * @return A collection of URL's.
  */
 public Collection getWorkloadError()
 {
   return workloadError;
 }

 /**
  * Get the URLs that were waiting to be processed.
  * You should add one URL to this collection to
  * begin the spider.
  *
  * @return A collection of URLs.
  */
 public Collection getWorkloadWaiting()
 {
   return workloadWaiting;
 }

 /**
  * Get the URLs that were processed by this spider.
  *
  * @return A collection of URLs.
  */
 public Collection getWorkloadProcessed()
 {
   return workloadProcessed;
 }

 /**
  * Clear all of the workloads.
  */
 public void clear()
 {
   getWorkloadError().clear();
   getWorkloadWaiting().clear();
   getWorkloadProcessed().clear();
 }

 /**
  * Set a flag that will cause the begin
  * method to return before it is done.
  */
 public void cancel()
 {
   cancel = true;
 }

 /**
  * Add a URL for processing.
  *
  * @param url
  */
 public void addURL(URL url)
 {
   if ( getWorkloadWaiting().contains(url) )
     return;
   if ( getWorkloadError().contains(url) )
     return;
   if ( getWorkloadProcessed().contains(url) )
     return;
//    log("Adding to workload: " + url );
   getWorkloadWaiting().add(url);
 }

   private void log (String str){
       System.out.println(str);
   }
 /**
  * Called internally to process a URL
  *
  * @param url The URL to be processed.
  */
 public void processURL(URL url)
 {
   try {
     log("Processing: " + url );
     // get the URL's contents
     URLConnection connection = url.openConnection();
     connection.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
     connection.setRequestProperty("Accept-Language", "en-us");
     connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
     connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
     connection.setRequestProperty("User-Agent", "naughtyOLA; Group OLA's");
     connection.setRequestProperty("Host", "localhost:1234");
     connection.setRequestProperty("Content-Length", "13");
     connection.setRequestProperty("Connection", "Keep-Alive");
     connection.setRequestProperty("Cache-Control", "no-cache");
  // connection.setRequestProperty("Referer", "http://www.eastandard.net");
     System.out.println("COONNN  " +connection.getRequestProperty("User-Agent"));
     if ( (connection.getContentType()!=null) &&
          !connection.getContentType().toLowerCase().startsWith("text/") ) {
       getWorkloadWaiting().remove(url);
       getWorkloadProcessed().add(url);
       log("Not processing because content type is: " +
            connection.getContentType() );
       return;
     }
       // can only search http: protocol URLs
       if (url.getProtocol().compareTo("http") != 0)
           return;

        // check that the URL is robot-safe (robots.txt) before searching
       if (!robotSafe(url))
         return;



     // read the URL
     InputStream is = connection.getInputStream();
//      parseWithRegexp(is);
     parseIt(is, url);


   } catch ( IOException e ) {
     getWorkloadWaiting().remove(url);
     getWorkloadError().add(url);
     log("Error: " + url );
//      report.spiderURLError(url);
     return;
   }
   // mark URL as complete
   getWorkloadWaiting().remove(url);
   getWorkloadProcessed().add(url);
   log("Complete: " + url );
//if (WorkloadProcessed = 5){pause for a sec;}
 }



   private void parseIt(InputStream is, URL url) throws IOException{
       Reader r = new InputStreamReader(is);
     // parse the URL
     HTMLEditorKit.Parser parse = new HTMLParse().getParser();
     parse.parse(r,new Parser(url),true);
   }

    /**
  * Called to start the spider
  */
 public void begin()
 {
   cancel = false;
   int noPages=0;
   while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
     Object list[] = getWorkloadWaiting().toArray();
     for ( int i=0;(i<list.length)&&!cancel;i++ ){
         noPages++;
         System.out.println();
         System.out.println("---------------------------- "+noPages+ "---------------------------- ");
         if (noPages>=5) {
             System.out.println();
             System.out.println("End of allowed pages");
             return;
         }
         processURL((URL)list[i]);
     }
   }
 }
public static void main(String[] args)   {
    String url="http://www.eastandard.net/";
   try {
     Spider spider = new Spider();
     spider.clear();
     URL base = new URL(url);
     spider.addURL(base);
     spider.begin();

   } catch ( MalformedURLException e ) {
   System.out.println("Malformed URL" + url);
   }
 }



Thanks,

lili lulu

Posted by admin (Graham Ellis), 12 April 2005
I would do it by calling the getHost method on your URL objects and seeing if they're equal, and by using a Thread.sleep for 1 second after every 5 pages gathered.
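A minimal sketch of both changes, based on the Spider class above (the baseHost field and the once-per-five-pages pause are illustrative additions, not part of the original code):

Code:
  // Remember the host of the first URL added, and refuse anything outside it
  protected String baseHost;

  public void addURL(URL url)
  {
    if ( baseHost == null )
      baseHost = url.getHost();                 // first URL defines the site
    if ( !url.getHost().equalsIgnoreCase(baseHost) )
      return;                                   // different domain - skip it
    if ( getWorkloadWaiting().contains(url) )
      return;
    if ( getWorkloadError().contains(url) )
      return;
    if ( getWorkloadProcessed().contains(url) )
      return;
    getWorkloadWaiting().add(url);
  }

  // In begin(), sleep for a second after every 5 pages rather than returning
  public void begin()
  {
    cancel = false;
    int noPages = 0;
    while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
      Object list[] = getWorkloadWaiting().toArray();
      for ( int i = 0; (i < list.length) && !cancel; i++ ) {
        processURL((URL)list[i]);
        noPages++;
        if ( noPages % 5 == 0 ) {               // roughly 5 queries per second
          try {
            Thread.sleep(1000);                 // pause for one second
          } catch ( InterruptedException e ) {
            return;                             // stop if we are interrupted
          }
        }
      }
    }
  }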



This page is a thread posted to the opentalk forum at www.opentalk.org.uk and archived here for reference. To jump to the archive index please follow this link.

