{

Mark Pilgrim's excellent Dive Into Python has a section on using SGMLParser. Having seen nothing similar (and imagining its many uses!), I thought I'd give it a whirl in IronPython. I thought a good proof of concept would be creating a database out of link-heavy sites. Since I visit Arts & Letters Daily every so often, and the closet intellectual in me likes to hang onto what I find there, I thought I'd target it:

import urllib2
import sgmllib
from sgmllib import SGMLParser

import clr
clr.AddReference("System.Data")
clr.AddReference("System.Net")
from System import *
from System.Data import *
from System.Net import *

class AlReader(SGMLParser):
    """Collect the hyperlinks of an HTML page, grouped by named anchor.

    Every ``<a name="...">`` tag switches the current category to that
    name; every ``<a href="...">`` tag that follows is recorded under the
    current category together with its link text.

    Public attributes (all reset by ``reset``):
        urls     -- href of each completed link, in document order
        pieces   -- "category|text" string for each completed link
        track    -- 1 while inside an ``<a href>`` element, otherwise 0
        prePend  -- current category ("No Category" until a named anchor
                    has been seen)
        counter  -- number of completed links

    ``urls`` and ``pieces`` always have the same length: a link is added to
    both only once its closing tag (or the next ``<a href>``) is reached.
    """

    def reset(self):
        """Clear all collected links and return to the initial state."""
        SGMLParser.reset(self)
        self.urls = []
        self.pieces = []
        self.track = 0
        self.prePend = "No Category"
        self.counter = 0
        # Structured (category, text, url) records, so that consumers do
        # not have to re-parse the "|"-joined strings.
        self._links = []
        # State of the <a href> element currently being read.
        self._category = self.prePend
        self._url = None
        self._chunks = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        attrs is the list of (name, value) pairs supplied by SGMLParser.
        An anchor with an href starts a new link; otherwise an anchor with
        a name attribute sets the category for the links that follow.
        """
        href = [v for k, v in attrs if k == "href"]
        key = [v for k, v in attrs if k == "name"]
        if href:
            if self.track:
                # The previous <a href> was never closed; record it now so
                # its text does not leak into this link.
                self._finish_link()
            self._category = self.prePend
            self._url = href[0]
            self._chunks = []
            self.track = 1
        elif key:
            # Use the value of the name attribute itself, not whichever
            # attribute happens to come first in the tag.
            self.prePend = key[0]

    def handle_data(self, text):
        """Accumulate text that appears inside the current link.

        The text of a single anchor can arrive in several calls, so the
        chunks are buffered and joined when the link is finished.
        """
        if self.track:
            self._chunks.append(text)

    def end_a(self):
        """Handle a closing ``</a>`` tag, completing the current link."""
        if self.track:
            self._finish_link()
        self.track = 0

    def _finish_link(self):
        """Record the link being read and leave link-tracking mode."""
        text = "".join(self._chunks)
        self._links.append((self._category, text, self._url))
        self.pieces.append("|".join([self._category, text]))
        self.urls.append(self._url)
        self.counter += 1
        self._chunks = []
        self.track = 0

    def get_links(self):
        """Return one "category|text|url" string per completed link.

        The fields are joined without escaping, so a "|" inside a link
        text or URL yields extra separators; use get_link_datatable when
        the individual fields are needed.
        """
        return ["|".join(link) for link in self._links]

    def get_link_datatable(self):
        """Return a DataTable with Category, Site and Url string columns.

        One row is added per completed link. Rows are filled from the
        structured records rather than by splitting the get_links strings,
        so values containing "|" are stored intact.
        """
        table = DataTable()
        for column_name in ("Category", "Site", "Url"):
            table.Columns.Add(
                DataColumn(column_name, Type.GetType("System.String")))

        for category, site, url in self._links:
            row = table.NewRow()
            row["Category"] = category
            row["Site"] = site
            row["Url"] = url
            table.Rows.Add(row)

        return table
        
# Download the Arts & Letters Daily front page.
response = urllib2.urlopen("http://www.aldaily.com")
try:
    html = response.read()
finally:
    # urllib2 responses are not context managers in Python 2, so release
    # the connection explicitly.
    response.close()

a = AlReader()
a.feed(html)
# close() forces SGMLParser to process any markup still held in its buffer.
a.close()
linkdata = a.get_link_datatable()

# write it out to prove we got it.
ds = DataSet()
ds.Tables.Add(linkdata)
ds.WriteXml("c:\\temp\\arts and letters links.xml")

If you find this interesting, do make sure you look at Pilgrim's chapter on HTML Processing.

Using SGMLParser With IronPython

{

}