Csharp/C Sharp/Network/Web Crawler


Download a page with WebClient.DownloadString

<source lang="csharp">


using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

class MainClass {

   private static void Main() {
       string remoteUri = "http://www.apress.ru";
       WebClient client = new WebClient();
       string str = client.DownloadString(remoteUri);
       MatchCollection matches = Regex.Matches(str, @"http\S+[^-,;:?]\.gif");
       foreach (Match match in matches) {
           foreach (Group grp in match.Groups) {
               string file = grp.Value.Substring(grp.Value.LastIndexOf("/") + 1);
               try {
                   Console.WriteLine("Downloading {0} to file {1}", grp.Value, file);
                   client.DownloadFile(new Uri(grp.Value), file);
               } catch {
                   Console.WriteLine("Failed to download {0}", grp.Value);
               }
           }
       }
   }

}

      </source>
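
A variation on the example above (a sketch, assuming the same remoteUri and pattern): the regular expression can match the same image URL more than once, so collecting the matches into a HashSet&lt;string&gt; first downloads each file only once.

<source lang="csharp">

using System;
using System.Collections.Generic;
using System.Net;
using System.Text.RegularExpressions;

class DedupDownload {

    private static void Main() {
        string remoteUri = "http://www.apress.ru";
        WebClient client = new WebClient();
        string str = client.DownloadString(remoteUri);

        // Collect distinct .gif URLs before downloading anything.
        HashSet<string> uris = new HashSet<string>();
        foreach (Match match in Regex.Matches(str, @"http\S+[^-,;:?]\.gif")) {
            uris.Add(match.Value);
        }

        foreach (string uri in uris) {
            string file = uri.Substring(uri.LastIndexOf("/") + 1);
            try {
                Console.WriteLine("Downloading {0} to file {1}", uri, file);
                client.DownloadFile(new Uri(uri), file);
            } catch {
                Console.WriteLine("Failed to download {0}", uri);
            }
        }
    }

}

      </source>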


Check the ContentType

<source lang="csharp">

using System;
using System.IO;
using System.Net;

class HtmlDump {

    public static int Main(string[] astrArgs)
    {
         WebRequest webreq;
         WebResponse webres;
  
         try
         {
              webreq = WebRequest.Create("http://www.nfex.ru/");
              webres = webreq.GetResponse();
         }
         catch (Exception exc)
         {
              Console.WriteLine("HtmlDump: {0}", exc.Message);
              return 1;
         }
  
         if (webres.ContentType.Substring(0, 4) != "text")
         {
              Console.WriteLine("HtmlDump: URI must be a text type.");
              return 1;
         }
  
         Stream       stream = webres.GetResponseStream();
         StreamReader strrdr = new StreamReader(stream);
         string       strLine;
  
         while ((strLine = strrdr.ReadLine()) != null){
              Console.WriteLine(strLine);
         }
         stream.Close();
         return 0;
    }

}


      </source>


Get a WebResponse from a WebRequest

<source lang="csharp"> using System; using System.Net; using System.IO; using System.Drawing; using System.Windows.Forms; public class MainClass {

   public static void Main() {
       string picUri = "http://www.apress.ru/img/img05/Hex_RGB4.jpg";
       string htmlUri = "http://www.apress.ru";
       WebRequest requestPic = WebRequest.Create(picUri);
       WebRequest requestHtml = WebRequest.Create(htmlUri);
       WebResponse responsePic = requestPic.GetResponse();
       WebResponse responseHtml = requestHtml.GetResponse();
       Image img = Image.FromStream(responsePic.GetResponseStream());
       using (StreamReader r = new StreamReader(responseHtml.GetResponseStream())) {
           Console.WriteLine(r.ReadToEnd());
       }
   }

}

      </source>
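
WebResponse and Image hold network and GDI resources; a variation of the sketch above (same hypothetical URIs) wraps them in using blocks so each connection is released as soon as it has been read.

<source lang="csharp">

using System;
using System.Drawing;
using System.IO;
using System.Net;

public class MainClass {

    public static void Main() {
        string picUri = "http://www.apress.ru/img/img05/Hex_RGB4.jpg";
        string htmlUri = "http://www.apress.ru";

        WebRequest requestPic = WebRequest.Create(picUri);
        WebRequest requestHtml = WebRequest.Create(htmlUri);

        // Dispose the picture response and the decoded image deterministically.
        using (WebResponse responsePic = requestPic.GetResponse())
        using (Image img = Image.FromStream(responsePic.GetResponseStream())) {
            Console.WriteLine("Image size: {0}x{1}", img.Width, img.Height);
        }

        // Dispose the HTML response once the page has been read.
        using (WebResponse responseHtml = requestHtml.GetResponse())
        using (StreamReader r = new StreamReader(responseHtml.GetResponseStream())) {
            Console.WriteLine(r.ReadToEnd());
        }
    }

}

      </source>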


Download a web page in a thread

<source lang="csharp"> using System; using System.Net; using System.Threading; class ThreadTest {

   static void Main() {
       new Thread(Download).Start();
       Console.WriteLine("download"s happening!");
       Console.ReadLine();
   }
   static void Download() {
       using (WebClient wc = new WebClient())
           try {
               wc.Proxy = null;
               wc.DownloadFile("http://www.google.ru", "index.html");
               Console.WriteLine("Finished!");
            } catch (Exception ex) {
                Console.WriteLine("Download failed: " + ex.Message);
            }
   }

}

      </source>
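
If you would rather not manage the thread yourself, WebClient can also download in the background. A minimal sketch (same hypothetical URL and file name as above) using DownloadFileAsync and the DownloadFileCompleted event:

<source lang="csharp">

using System;
using System.ComponentModel;
using System.Net;

class AsyncDownloadTest {

    static void Main() {
        using (WebClient wc = new WebClient()) {
            wc.Proxy = null;
            // Raised when the background download finishes (or fails).
            wc.DownloadFileCompleted += delegate(object sender, AsyncCompletedEventArgs e) {
                if (e.Error != null)
                    Console.WriteLine("Download failed: " + e.Error.Message);
                else
                    Console.WriteLine("Finished!");
            };
            wc.DownloadFileAsync(new Uri("http://www.google.ru"), "index.html");
            Console.WriteLine("download's happening!");
            Console.ReadLine();   // keep the process alive until the download completes
        }
    }

}

      </source>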


MiniCrawler: A skeletal Web crawler

<source lang="csharp"> /* C#: The Complete Reference by Herbert Schildt Publisher: Osborne/McGraw-Hill (March 8, 2002) ISBN: 0072134852

  • /

// MiniCrawler: A skeletal Web crawler.

using System; using System.Net; using System.IO;

public class MiniCrawler {

 // Find a link in a content string. 
 static string FindLink(string htmlstr,  
                        ref int startloc) { 
   int i; 
   int start, end; 
   string uri = null; 
   string lowcasestr = htmlstr.ToLower(); 

   i = lowcasestr.IndexOf("href=\"http", startloc); 
   if(i != -1) { 
      start = htmlstr.IndexOf("\"", i) + 1;
      end = htmlstr.IndexOf("\"", start);
     uri = htmlstr.Substring(start, end-start); 
     startloc = end; 
   } 
            
   return uri; 
 } 

 public static void Main(string[] args) { 
   string link = null; 
   string str; 
   string answer; 

   int curloc; // holds current location in response 

   if(args.Length != 1) { 
     Console.WriteLine("Usage: MiniCrawler <uri>"); 
     return ; 
   } 

   string uristr = args[0]; // holds current URI 

   try { 

     do { 
       Console.WriteLine("Linking to " + uristr); 

       /* Create a WebRequest to the specified URI. */
       HttpWebRequest req = (HttpWebRequest) 
              WebRequest.Create(uristr); 

       uristr = null; // disallow further use of this URI 

       // Send that request and return the response. 
       HttpWebResponse resp = (HttpWebResponse) 
              req.GetResponse(); 

       // From the response, obtain an input stream. 
       Stream istrm = resp.GetResponseStream(); 

       // Wrap the input stream in a StreamReader. 
       StreamReader rdr = new StreamReader(istrm); 

       // Read in the entire page. 
       str = rdr.ReadToEnd(); 

       curloc = 0; 
       
       do { 
         // Find the next URI to link to. 
         link = FindLink(str, ref curloc); 

         if(link != null) { 
           Console.WriteLine("Link found: " + link); 

           Console.Write("Link, More, Quit?"); 
           answer = Console.ReadLine(); 

           if(string.Compare(answer, "L", true) == 0) {
             uristr = string.Copy(link);
             break;
           } else if(string.Compare(answer, "Q", true) == 0) {
             break;
           } else if(string.Compare(answer, "M", true) == 0) {
             Console.WriteLine("Searching for another link."); 
           } 
         } else { 
           Console.WriteLine("No link found."); 
           break; 
         } 

       } while(link.Length > 0); 

       // Close the Response. 
       resp.Close(); 
     } while(uristr != null); 

   } catch(WebException exc) { 
     Console.WriteLine("Network Error: " + exc.Message +  
                       "\nStatus code: " + exc.Status); 
   } catch(ProtocolViolationException exc) { 
     Console.WriteLine("Protocol Error: " + exc.Message); 
   } catch(UriFormatException exc) { 
     Console.WriteLine("URI Format Error: " + exc.Message); 
   } catch(NotSupportedException exc) { 
     Console.WriteLine("Unknown Protocol: " + exc.Message); 
   } catch(IOException exc) { 
     Console.WriteLine("I/O Error: " + exc.Message); 
   } 

   Console.WriteLine("Terminating MiniCrawler."); 
 } 

}


      </source>


Output webpage content

<source lang="csharp">

using System.Net;
using System;
using System.IO;

public class WebPagesApp {

   [STAThread]
   public static void Main(string[] args) {
       string s = "http://www.microsoft.ru";
       Uri uri = new Uri(s);
       WebRequest req = WebRequest.Create(uri);
       WebResponse resp = req.GetResponse();
       Stream str = resp.GetResponseStream();
       StreamReader sr = new StreamReader(str);
       string t = sr.ReadToEnd();
        // Extract the markup between <HEAD> and </HEAD> (case-insensitive search);
        // Substring takes a length, so pass j - i rather than the end index.
        int i = t.IndexOf("<HEAD>", StringComparison.OrdinalIgnoreCase);
        int j = t.IndexOf("</HEAD>", StringComparison.OrdinalIgnoreCase);
        string u = t.Substring(i, j - i);
       Console.WriteLine("{0}", u);
   }

}

      </source>
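
A related sketch (hypothetical URL; assumes the page actually has a &lt;title&gt; element): instead of slicing the header out by index, a case-insensitive regular expression can pull just the title from the downloaded markup.

<source lang="csharp">

using System;
using System.Net;
using System.Text.RegularExpressions;

public class PageTitleApp {

    public static void Main(string[] args) {
        using (WebClient client = new WebClient()) {
            string html = client.DownloadString("http://www.microsoft.ru");
            // Grab the contents of the <title> element, ignoring case.
            Match m = Regex.Match(html, @"<title>(.*?)</title>",
                                  RegexOptions.IgnoreCase | RegexOptions.Singleline);
            if (m.Success)
                Console.WriteLine("Title: {0}", m.Groups[1].Value.Trim());
            else
                Console.WriteLine("No <title> element found.");
        }
    }

}

      </source>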


Set the BaseAddress for WebClient

<source lang="csharp"> using System; using System.Collections.Generic; using System.Text; using System.Net; class Program {

   static void Main(string[] args) {
       WebClient client = new WebClient();
       client.BaseAddress = "http://www.microsoft.ru";
       string data = client.DownloadString("Office");
       Console.WriteLine(data);
   }

}

      </source>
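

BaseAddress composes with relative paths, and WebClient.QueryString appends query parameters to the request URI. A small sketch (the relative path and the parameter are made up for illustration):

<source lang="csharp">

using System;
using System.Net;

class BaseAddressQueryDemo {

    static void Main(string[] args) {
        using (WebClient client = new WebClient()) {
            client.BaseAddress = "http://www.microsoft.ru";
            // Hypothetical query parameter, appended to the request URI automatically.
            client.QueryString.Add("q", "office");
            // Hypothetical relative path, resolved against BaseAddress.
            string data = client.DownloadString("search");
            Console.WriteLine(data);
        }
    }

}

      </source>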