"""Extract list of URLs in a web page

This program is part of "Dive Into Python", a free Python book for
experienced programmers.  Visit http://diveintopython.org/ for the
latest version.
"""

#$Id: urllister.py 378 2006-02-10 16:55:59Z mikel $

__author__ = "Mark Pilgrim (mark@diveintopython.org)"
__version__ = "$Revision: 378 $"
__date__ = "$Date: 2006-02-10 17:55:59 +0100 (or., 10 ots 2006) $"
__copyright__ = "Copyright (c) 2001 Mark Pilgrim"
__license__ = "Python"

from sgmllib import SGMLParser

class URLLister(SGMLParser):
	def reset(self):
		SGMLParser.reset(self)
		self.urls = []

	def start_a(self, attrs):
		href = [v for k, v in attrs if k=='href']
		if href:
			self.urls.extend(href)

if __name__ == "__main__":
	# Demo: download the Dive Into Python home page and print each link
	# target that URLLister collects from it, one per line.
	import urllib
	usock = urllib.urlopen("http://diveintopython.org/")
	try:
		# Read the whole page first so the socket is released even if
		# reading or the later parsing step raises.
		page = usock.read()
	finally:
		usock.close()
	parser = URLLister()
	parser.feed(page)
	# Per the sgmllib docs, close() forces processing of buffered input.
	parser.close()
	for url in parser.urls:
		# Parenthesized single argument: same output with the Python 2
		# print statement, and valid syntax on newer interpreters too.
		print(url)
