Training, Open Source computer languages
PerlPHPPythonMySQLApache / TomcatTclRubyJavaC and C++LinuxCSS 
Search for:
Home Accessibility Courses Diary The Mouth Forum Resources Site Map About Us Contact
Web Crawler

Posted by geez_itsjustme (geez_itsjustme), 12 April 2005
Hi! I'm altering some web crawler code. How do I restrict the pages searched to the site domain of the initially supplied URL?

Secondly, how can I restrict the number of queries to 5/sec?

Code:
public class Spider {

 /**
  * URLs whose processing ended in an error. processURL adds a URL here
  * when it catches an IOException.
  */
 protected Collection workloadError = new ArrayList(3);

 /**
  * URLs that are waiting to be processed. addURL appends to this
  * collection and begin() keeps running while it is non-empty.
  */
 protected Collection workloadWaiting = new ArrayList(3);

 /**
  * URLs that have been processed (or deliberately skipped because of
  * their content type).
  */
 protected Collection workloadProcessed = new ArrayList(3);


 /**
  * A flag that indicates whether this process
  * should be canceled. Set by cancel(), reset and polled by begin().
  */
 protected boolean cancel = false;

 /**
  * The literal text "Disallow:". It is not referenced anywhere in the
  * visible part of this class.
  * NOTE(review): presumably the robots.txt directive prefix used by
  * robotSafe - confirm against that method.
  */
 public static final String DISALLOW = "Disallow:";

 /**
  * Creates a spider. The constructor body is empty; the three workload
  * collections are created by their field initializers.
  *
  * An earlier variant that accepted an ISpiderReportable callback is
  * kept below, commented out; the no-argument constructor takes no
  * report object.
  */
//  public Spider(ISpiderReportable report)
//    {
//      this.report = report;
//    }
   public Spider()
     {
     }

 /**
  * Get the URLs that resulted in an error.
  *
  * @return A collection of URL's.
  */
 public Collection getWorkloadError()
 {
   return workloadError;
 }

 /**
  * Get the URLs that were waiting to be processed.
  * You should add one URL to this collection to
  * begin the spider.
  *
  * @return A collection of URLs.
  */
 public Collection getWorkloadWaiting()
 {
   return workloadWaiting;
 }

 /**
  * Get the URLs that were processed by this spider.
  *
  * @return A collection of URLs.
  */
 public Collection getWorkloadProcessed()
 {
   return workloadProcessed;
 }

 /**
  * Clear all of the workloads.
  */
 public void clear()
 {
   getWorkloadError().clear();
   getWorkloadWaiting().clear();
   getWorkloadProcessed().clear();
 }

 /**
  * Set a flag that will cause the begin
  * method to return before it is done.
  */
 public void cancel()
 {
   cancel = true;
 }

 /**
  * Add a URL for processing.
  *
  * @param url
  */
 public void addURL(URL url)
 {
   if ( getWorkloadWaiting().contains(url) )
     return;
   if ( getWorkloadError().contains(url) )
     return;
   if ( getWorkloadProcessed().contains(url) )
     return;
//    log("Adding to workload: " + url );
   getWorkloadWaiting().add(url);
 }

   private void log (String str){
       System.out.println(str);
   }
 /**
  * Called internally to process a URL
  *
  * @param url The URL to be processed.
  */
 public void processURL(URL url)
 {
   try {
     log("Processing: " + url );
     // get the URL's contents
     URLConnection connection = url.openConnection();
     connection.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
     connection.setRequestProperty("Accept-Language", "en-us");
     connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
     connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
     connection.setRequestProperty("User-Agent", "naughtyOLA; Group OLA's");
     connection.setRequestProperty("Host", "localhost:1234");
     connection.setRequestProperty("Content-Length", "13");
     connection.setRequestProperty("Connection", "Keep-Alive");
     connection.setRequestProperty("Cache-Control", "no-cache");
  // connection.setRequestProperty("Referer", "http://www.eastandard.net");
     System.out.println("COONNN  " +connection.getRequestProperty("User-Agent"));
     if ( (connection.getContentType()!=null) &&
          !connection.getContentType().toLowerCase().startsWith("text/") ) {
       getWorkloadWaiting().remove(url);
       getWorkloadProcessed().add(url);
       log("Not processing because content type is: " +
            connection.getContentType() );
       return;
     }
       // can only search http: protocol URLs
       if (url.getProtocol().compareTo("http") != 0)
           return;

       // test to make sure it is before searching
       if (!robotSafe(url))
         return;



     // read the URL
     InputStream is = connection.getInputStream();
//      parseWithRegexp(is);
     parseIt(is, url);


   } catch ( IOException e ) {
     getWorkloadWaiting().remove(url);
     getWorkloadError().add(url);
     log("Error: " + url );
//      report.spiderURLError(url);
     return;
   }
   // mark URL as complete
   getWorkloadWaiting().remove(url);
   getWorkloadProcessed().add(url);
   log("Complete: " + url );
//if (WorkloadProcessed = 5){pause for a sec;}
 }



   private void parseIt(InputStream is, URL url) throws IOException{
       Reader r = new InputStreamReader(is);
     // parse the URL
     HTMLEditorKit.Parser parse = new HTMLParse().getParser();
     parse.parse(r,new Parser(url),true);
   }

    /**
  * Called to start the spider
  */
 public void begin()
 {
   cancel = false;
   int noPages=0;
   while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
     Object list[] = getWorkloadWaiting().toArray();
     for ( int i=0;(i<list.length)&&!cancel;i++ ){
         noPages++;
         System.out.println();
         System.out.println("---------------------------- "+noPages+ "---------------------------- ");
         if (noPages>=5) {
             System.out.println();
             System.out.println("End of allowed pages");
             return;
         }
         processURL((URL)list[i]);
     }
   }
 }
public static void main(String[] args)   {
    String url="http://www.eastandard.net/";
   try {
     Spider spider = new Spider();
     spider.clear();
     URL base = new URL(url);
     spider.addURL(base);
     spider.begin();

   } catch ( MalformedURLException e ) {
   System.out.println("Malformed URL" + url);
   }
 }



Thanks,

lili lulu

Posted by admin (Graham Ellis), 12 April 2005
I would do it by calling the getHost method on your URL objects and seeing if they're equal, and by using a Thread.sleep for 1 second after every 5 pages gathered.



This page is a thread posted to the opentalk forum at www.opentalk.org.uk and archived here for reference. To jump to the archive index please follow this link.

You can Add a comment or ranking to this page

© WELL HOUSE CONSULTANTS LTD., 2012: Well House Manor • 48 Spa Road • Melksham, Wiltshire • United Kingdom • SN12 7NY
PH: 01144 1225 708225 • FAX: 01144 1225 899360 • EMAIL: info@wellho.net • WEB: http://www.wellho.net • SKYPE: wellho