Monday 24 March 2014

How to get Raw HTML from a Page : System.Net.WebResponse

Retrieving a page's HTML can sometimes be necessary in applications. You could be filling columns with a site's actual metadata, or retrieving remote web pages to extract specific content. It requires only very simple code to do all of this.

// Build the page address from the site URL and the file's URL, download it,
// then reduce the returned markup to plain text.
string pageAddress = strWebURL + "/" + file.Url;
string strHTML = Strip(Request(pageAddress, "GET"));

//The Request Function
/// <summary>
/// Sends a body-less web request to <paramref name="uri"/> using the given
/// method and returns the response body as text.
/// </summary>
/// <param name="uri">Address of the resource to request.</param>
/// <param name="verb">Request method to use, e.g. "GET".</param>
/// <returns>
/// The response body with leading and trailing whitespace trimmed, or
/// <c>null</c> when no response or no response stream is available.
/// </returns>
/// <remarks>
/// Exceptions raised while creating or executing the request are not caught
/// here and propagate to the caller.
/// </remarks>
String Request(String uri, String verb)
{
    WebRequest request = WebRequest.Create(uri);
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = verb;
    request.ContentLength = 0; // no request body is ever written

    // The response, its stream and the reader are all disposable; wrapping
    // them in using blocks releases them even if reading throws.
    using (WebResponse response = request.GetResponse())
    {
        if (response == null)
        {
            return null;
        }

        using (Stream body = response.GetResponseStream())
        {
            // StreamReader rejects a null stream, so bail out explicitly.
            if (body == null)
            {
                return null;
            }

            using (StreamReader reader = new StreamReader(body))
            {
                return reader.ReadToEnd().Trim();
            }
        }
    }
}

// The Strip function, to remove style, scripts & Html tags from the text

 public string Strip(string text)
        {
            text = Regex.Replace(text, "", "", RegexOptions.Singleline | RegexOptions.IgnoreCase);
            text = Regex.Replace(text, "", "", RegexOptions.Singleline | RegexOptions.IgnoreCase);
            text = Regex.Replace(text, @"<(.|\n)*?>", string.Empty);
            text = text.Replace("\r\n\t", " ").Replace("\n", " ").Replace("\r", " ").Replace("\t", " ");
            return text;
        }

// That's All

No comments:

Post a Comment