| 54 | |
| 55 | == Scraping == |
| 56 | Scraping is reasonably simple as well. All of the real work is done in the loop at the bottom. It iterates over files and templates that match that file, finds any matches for the template, stores them in the ''data'' table, and then removes any entries from previous runs. |
| 57 | |
| 58 | There are currently exactly three types of templates: ''regex'', ''xpath'', and ''urlregex''. ''regex'' is obvious. You supply a regular expression, and any captures are stored as values for that match. ''xpath'' converts the file to an XML tree using lxml.etree.HTMLParser and then executes the xpath query on it. ''urlregex'' is just like ''regex'' except that it operates on the URL instead of the file contents. This was just added as an afterthought to be able to extract data from the URL. |
| 59 | |
| 60 | Another afterthought was manipulating the extracted data. This is done through Python's ''%'' string formatting and the ''!FancyFormatter'' class, which provides access to multiple named values or does a url join on the base URL and a relative path.
| 61 | |
| 62 | {{{ |
| 63 | #!python |
| 64 | import MySQLdb |
| 65 | import re |
| 66 | from StringIO import StringIO |
| 67 | from lxml import etree |
| 68 | import datetime |
| 69 | from urllib import basejoin |
| 70 | |
# Module-level database handle and the cursor shared by the scan loop and
# add_tuple().  NOTE(review): credentials here look like placeholders --
# confirm real values are injected elsewhere before running.
db = MySQLdb.connect(user='user', passwd='passwd', host='host', db='db')
cursor = db.cursor()
| 73 | |
def xpath_search(content, query):
    """Evaluate the XPath expression *query* against *content*.

    The content is parsed leniently as HTML (lxml's HTMLParser), so
    real-world, non-well-formed pages still produce a tree.  Returns
    whatever the XPath evaluation yields -- typically a list of nodes
    or strings.
    """
    parser = etree.HTMLParser()
    document = etree.parse(StringIO(content), parser)
    evaluate = etree.XPath(query)
    return evaluate(document)
| 78 | |
class FancyFormatter(object):
    """Mapping used on the right-hand side of a ``format % ...`` expression.

    Exposes the keys of the wrapped dictionary to ``%(name)s`` conversions
    and adds one synthetic key, ``makeurl``, which joins the page URL with
    the extracted value (useful for turning relative links absolute).
    """

    def __init__(self, dictionary):
        self._dict = dictionary

    def __getitem__(self, item):
        # Plain lookups pass straight through; only the synthetic
        # 'makeurl' key gets special treatment.
        if item != 'makeurl':
            return self._dict[item]
        return basejoin(self._dict['url'], self._dict['value'])

    def __str__(self):
        return self._dict['value']
| 91 | |
def add_tuple(meaning, url, value, format):
    """Store one scraped (meaning, url, value) row, timestamped with NOW().

    When *format* is non-empty, *value* is first rewritten through the
    format string; FancyFormatter supplies the named substitutions
    ('url', 'value', and the synthetic 'makeurl').
    """
    if format:
        substitutions = FancyFormatter({'url': url, 'value': value})
        value = format % substitutions
    sql = 'INSERT INTO data (created, meaning, url, value) VALUES (NOW(), %s, %s, %s)'
    cursor.execute(sql, (meaning, url, value))
| 97 | |
| 98 | print 'Scraping...' |
| 99 | cursor.execute('SELECT url, content FROM files WHERE content IS NOT NULL') |
| 100 | for (url, content) in cursor: |
| 101 | start = datetime.datetime.now() |
| 102 | |
| 103 | templates = db.cursor() |
| 104 | templates.execute('SELECT type, pattern, meaning, format FROM templates WHERE %s LIKE url', (url,)) |
| 105 | for (type, pattern, meaning, format) in templates: |
| 106 | if type == 'xpath': |
| 107 | for value in xpath_search(content, pattern): |
| 108 | add_tuple(meaning, url, value, format) |
| 109 | elif type == 'regex': |
| 110 | for value in re.search(pattern, content, re.S|re.M): |
| 111 | add_tuple(meaning, url, value, format) |
| 112 | elif type == 'urlregex': |
| 113 | match = re.search(pattern, url, re.S|re.M) |
| 114 | if match: |
| 115 | for value in match.groups(): |
| 116 | add_tuple(meaning, url, value, format) |
| 117 | else: |
| 118 | raise RuntimeError('Unknown template type: "%s".' % (type,)) |
| 119 | cursor.execute('DELETE FROM data WHERE url=%s AND created<%s', (url, start)) |
| 120 | db.commit() |
| 121 | print 'Done.' |
| 122 | }}} |