Miscellaneous .NET tips, code, comments, and what-not.

Wednesday, March 23, 2005

Using HTML Tidy in .NET to screen scrape and create RSS/XML

Here is a little code that uses HTML Tidy and XPath to screen scrape links, and the descriptions of those links, from a web page. This technique probably will not work for every web site, because HTML Tidy chokes on especially poor, standards-noncompliant web pages, but it should work for most "plainer" web pages with matching tags and well-formed div tags. I would like to give a shout-out to O'Reilly's XML.Com pages and the wonderful XPath Explorer.






/// <summary>
/// Screen-scrapes several pages of Spurl.Com search results: each page is
/// downloaded, cleaned into XHTML with HTML Tidy, queried with XPath for the
/// result links and their descriptions, and the collected links are bound to
/// the <c>spurlDG</c> DataGrid.
/// </summary>
/// <param name="urlString">Base search URL; "&amp;page=N" is appended for each requested page.</param>
/// <param name="styleSheet">Not used by this method.</param>
/// <param name="cachedFileName">Not used by this method.</param>
/// <param name="tagList">Value stored in <c>LinkTags</c> of every link that is created.</param>
public void getSpurlLinks(string urlString, string styleSheet, string cachedFileName, string tagList)
{
    // Spurl.Com has multiple page results, so loop through 10, arbitrarily,
    // to get a number of result links from their search.
    const int numberOfPages = 10;

    // A couple of file options for HTML Tidy.
    const string optFile = @"c:\foo.tidy";
    const string errFile = @"c:\err.tidy";

    // Scratch file that receives the tidied XHTML of the page being processed.
    const string xhtmlFile = @"c:\xml\spurl.html";

    // These are the nodes wanted from the newly created XHTML file.
    // The title is the text of the same anchor that carries the href.
    const string hrefXPath = "/html/body/div[@class='results']/div[@class='spurlResLink']/a";
    const string descXPath = "/html/body/div[@class='results']/div[@class='spurlResLink']/div[1]";
    const string titleXPath = "/html/body/div[@class='results']/div[@class='spurlResLink']/a";

    ArrayList theseLinks = new ArrayList();
    UTF8Encoding utf8 = new UTF8Encoding();

    using (WebClient webClient = new WebClient())
    {
        for (int page = 1; page <= numberOfPages; page++)
        {
            // Build the page URL from the unmodified base URL. Appending to
            // urlString itself would accumulate "&page=1&page=2..." over the loop.
            string pageUrl = urlString + "&page=" + Convert.ToString(page);
            string rawHtml = utf8.GetString(webClient.DownloadData(pageUrl));

            // HTML Tidy uses the Document class for the following settings:
            Document thisHTML = new Document();
            thisHTML.LoadConfig(optFile);
            thisHTML.SetErrorFile(errFile);
            thisHTML.ParseString(rawHtml);
            thisHTML.CleanAndRepair();
            thisHTML.RunDiagnostics();
            thisHTML.SetOptBool(TidyOptionId.TidyForceOutput, 1);
            string fixedDoc = thisHTML.SaveString();

            // HTML Tidy emits a default namespace declaration, which would make
            // the un-prefixed XPath queries above match nothing. Turning it into
            // a prefixed declaration leaves the elements without a namespace.
            fixedDoc = fixedDoc.Replace("xmlns=", "xmlns:html=");

            // using guarantees the file handle is released even if Write throws,
            // so the reader below can always open the file.
            using (StreamWriter sw = new StreamWriter(xhtmlFile, false))
            {
                sw.Write(fixedDoc);
            }

            XmlDocument myDoc = new XmlDocument();
            XmlTextReader myRdr = new XmlTextReader(xhtmlFile);
            myRdr.WhitespaceHandling = WhitespaceHandling.None;
            try
            {
                myDoc.Load(myRdr);
            }
            catch (XmlException badPage)
            {
                // Best effort: a page Tidy could not turn into well-formed XML
                // is skipped, but the failure is recorded instead of vanishing.
                System.Diagnostics.Trace.WriteLine(
                    "getSpurlLinks: skipping " + pageUrl + ": " + badPage.Message);
                continue;
            }
            finally
            {
                myRdr.Close();
            }

            // Put the nodes into node collections for looping through.
            XmlNodeList hrefNodes = myDoc.SelectNodes(hrefXPath);
            XmlNodeList titleNodes = myDoc.SelectNodes(titleXPath);
            XmlNodeList descNodes = myDoc.SelectNodes(descXPath);

            for (int i = 0; i < hrefNodes.Count; i++)
            {
                XmlNode hrefAttribute = hrefNodes[i].Attributes.GetNamedItem("href");
                if (hrefAttribute == null)
                {
                    // An anchor without an href is not a usable link.
                    continue;
                }

                links l = new links(); // my own links class
                l.LinkUrl = hrefAttribute.Value;
                l.LinkId = i + 990; // just some arbitrary link id #

                // XmlNodeList's indexer returns null past the end of the list,
                // so a result without a description gets blank text.
                if (titleNodes[i] == null || descNodes[i] == null)
                {
                    l.LinkTitle = "";
                    l.LinkDescription = "";
                }
                else
                {
                    l.LinkTitle = titleNodes[i].InnerText;
                    l.LinkDescription = descNodes[i].InnerText;
                }

                l.LinkTags = tagList;
                theseLinks.Add(l);
            }
        }
    }

    // Bind the link collection to the DataGrid for presentation, once,
    // after every page has been processed.
    spurlDG.DataSource = theseLinks;
    spurlDG.DataBind();
}

0 Comments:

Post a Comment

<< Home