#!/usr/bin/env python # coding: utf-8 # # autorss.py: Automatic RSS generator # # Copyright (c) 2007 Hye-Shik Chang # # This software is provided 'as-is', without any express or implied # warranty. In no event will the authors be held liable for any damages # arising from the use of this software. # # Permission is granted to anyone to use this software for any purpose, # including commercial applications, and to alter it and redistribute it # freely, subject to the following restrictions: # # 1. The origin of this software must not be misrepresented; you must not # claim that you wrote the original software. If you use this software # in a product, an acknowledgment in the product documentation would be # appreciated but is not required. # # 2. Altered source versions must be plainly marked as such, and must not be # misrepresented as being the original software. # # 3. This notice may not be removed or altered from any source # distribution. # from __future__ import division import os import sys import urllib import urlparse import difflib import re import math import time import datetime import uuid import unicodedata import difflib import hashlib from BeautifulSoup import BeautifulSoup, Tag from numpy import zeros from Bio.Cluster import treecluster import PyRSS2Gen reload(sys) # recover setdefaultencoding for PyRSS2Gen class URLDistanceMeter(object): def __init__(self, subs={}, indel={}, default_penalty=1): self.subs = subs self.indel = indel @staticmethod def build_penalty_table(digit, alpha, special, exception={}): tbl = {} for c in map(chr, range(256)): if c in exception: tbl[c] = exception[c] elif c.isdigit(): tbl[c] = digit elif c.isalpha(): tbl[c] = alpha elif ' ' <= c <= '\x7e': tbl[c] = special else: tbl[c] = 1 return tbl pat_indel = re.compile('\x00[+-]([^\x01]*)\x01') pat_subs = re.compile('\x00\^([^\x01]*)\x01') def score_part(self, dpart): d = 0 for indel in self.pat_indel.findall(dpart): d += sum(self.indel[c] for c in indel) for sub in self.pat_subs.findall(dpart): d += sum(self.subs[c] for c in sub) return d def __call__(self, v, w): vdif, wdif, different = difflib._mdiff([v], [w]).next() if not different: return 0 return self.score_part(vdif[1]) + self.score_part(wdif[1]) def http_url(url, baseurl): joined = urlparse.urljoin(baseurl, url) if joined.startswith('http://') or joined.startswith('https://'): return joined else: return None def extract_contents(node): if isinstance(node, unicode): return node return u''.join(extract_contents(child) for child in node.contents) def find_links(soup, listurl): for taginc in soup.findAll('a'): attrs = dict(taginc.attrs) url = http_url(attrs.get('href', 'about:blank'), listurl) if not url: continue linktext = extract_contents(taginc) yield (url, linktext) def build_distmatrix(urls, distfunc): distmatrix = zeros([len(urls)] * 2) for i, u1 in enumerate(urls): for j, u2 in enumerate(urls): if j >= i: break distmatrix[i, j] = distmatrix[j, i] = distfunc(u1[0], u2[0]) return distmatrix def build_clusters(urls, distmatrix, aggrsize=3, mindistwindow=1, bigleap=2): tree = treecluster(method='s', distancematrix=distmatrix) clusters = {} def digest(node): if node >= 0: urls = [node] uniq = 1 maxdist = 0 else: urls = clusters[node]['urls'] uniq = clusters[node]['unique'] maxdist = clusters[node]['maxdist'] return urls, uniq, maxdist # join clusters until enough aggregations form. for i, join in enumerate(tree): lefturls, leftuniq, leftdist = digest(join.left) righturls, rightuniq, rightdist = digest(join.right) nuniq = leftuniq + rightuniq if join.distance > 0 else 1 if (((leftuniq >= aggrsize and rightuniq >= aggrsize) or (len(lefturls) + len(righturls) == len(urls))) and join.distance >= mindistwindow): break # avoid join of well-aggregated cluster to quite different new member if ((leftdist or rightdist) and (leftdist + rightdist) * bigleap < join.distance): break clusters[-i - 1] = { 'urls': lefturls + righturls, 'maxdist': join.distance, 'unique': nuniq, } if join.left < 0: del clusters[join.left] if join.right < 0: del clusters[join.right] return clusters.values() def wcwidth(ustr): nwide = sum(1 for u in ustr if unicodedata.east_asian_width(u) in 'WAF') return nwide + len(ustr) class GammaScoreFunction(object): def __init__(self, k, theta, weight): self.k = k self.theta = theta self.kf = self.factorial(k) self.peak = self.gammapdf((k - 1) * theta) / weight @staticmethod def factorial(v): return reduce(lambda x, y: x * y, range(1, v + 1), 1) def gammapdf(self, x): return ((x ** (self.k - 1)) * math.exp(-x / self.theta) / (self.kf * self.theta**self.k)) def __call__(self, x): return self.gammapdf(x) / self.peak def pick_cluster(clusters, urls, score_length, score_csize, score_maxdist): for l in clusters: linktexts = [urls[urlno][1] for urlno in l['urls']] avgtextlength = sum(map(wcwidth, linktexts)) / len(linktexts) maxdistscore = score_maxdist(l['maxdist']) textlenscore = score_length(avgtextlength) csizescore = score_csize(l['unique']) l['score'] = maxdistscore + textlenscore + csizescore #print "CLUSTER", maxdistscore, l['maxdist'], textlenscore, csizescore #for urlno in l['urls']: # print ' ', urls[urlno][0], urls[urlno][1].encode('utf-8') return sorted(clusters, key=lambda l: l['score'])[-1] def html_escape(html): return html.replace('&', '&').replace('<', '<').replace('>', '>') def html_simplify(html): def simplify_contents(node): if isinstance(node, unicode): return html_escape(node.strip()) elif isinstance(node, Tag): if node.name in (u'br', u'p'): return u'
\n' elif node.name == u'img': return unicode(node) return u''.join(simplify_contents(child) for child in node.contents) soup = BeautifulSoup(html, fromEncoding='utf-8') return simplify_contents(soup) def diff_digest(left, right): leftdig = [] rightdig = [] splitter = '
' for dline in difflib.ndiff(left, right): if dline.startswith('-'): leftdig.append(dline[1:]) elif dline.startswith('+'): rightdig.append(dline[1:]) else: if leftdig and leftdig[-1] != splitter: leftdig.append(splitter) if rightdig and rightdig[-1] != splitter: rightdig.append(splitter) return html_simplify('\n'.join(leftdig)), html_simplify('\n'.join(rightdig)) class DocumentDiffCache(object): digestsuffix = '-digest' cacheencoding = 'utf-8' retrievedelay = 5 def __init__(self, topdir='./cache'): self.topdir = topdir if not os.path.isdir(topdir): os.mkdir(topdir) def get(self, url): cachepath = self.cachepath(url) if not os.path.exists(cachepath): time.sleep(self.retrievedelay) urllib.urlretrieve(url, cachepath) return open(cachepath) def cachepath(self, url): hashid = hashlib.sha1(url).hexdigest() return os.path.join(self.topdir, hashid) def diff(self, url1, url2): u1path = self.cachepath(url1) + self.digestsuffix u2path = self.cachepath(url2) + self.digestsuffix if os.path.exists(u1path) and os.path.exists(u2path): return [(open(path).read().decode(self.cacheencoding), os.path.getmtime(path)) for path in (u1path, u2path)] u1text, u2text = [ BeautifulSoup(self.get(url)).prettify().splitlines() for url in (url1, url2)] digest1, digest2 = diff_digest(u1text, u2text) if not os.path.exists(u1path): # don't overwrite to keep datetime open(u1path, 'w').write(digest1.encode(self.cacheencoding)) if not os.path.exists(u2path): open(u2path, 'w').write(digest2.encode(self.cacheencoding)) u1mtime = os.path.getmtime(u1path) u2mtime = os.path.getmtime(u2path) return (digest1, u1mtime), (digest2, u2mtime) def generate_digests(urls, cache=None): if len(urls) % 2 == 1: assert len(urls) > 2 # can't generate digest with only 1 url urls += [urls[0]] if cache is None: cache = DocumentDiffCache() r = [] for a, b in zip(urls[::2], urls[1::2]): r.extend(cache.diff(a, b)) return r def retrieve_webinfo(url): # Parameter Setting subs = URLDistanceMeter.build_penalty_table(0.1, 0.5, 2, {'/': 2, '%': 0.5}) indel = URLDistanceMeter.build_penalty_table(0.2, 1, 4, {'/': 6, '%': 0.8}) dist = URLDistanceMeter(subs, indel) score_length = GammaScoreFunction(3, 18, 1) score_csize = GammaScoreFunction(3, 5, 1) score_maxdist = GammaScoreFunction(3, 0.4, 1) cache = DocumentDiffCache() soup = BeautifulSoup(urllib.urlopen(url)) atitle = soup.find('head').find('title').contents[0] urls = list(find_links(soup, url)) distmatrix = build_distmatrix(urls, dist) clusters = build_clusters(urls, distmatrix) clstr = pick_cluster(clusters, urls, score_length, score_csize, score_maxdist) curls = [urls[uid][0] for uid in clstr['urls']] digests = generate_digests(curls, cache) return url, atitle, [(urls[uid][0], urls[uid][1], dig, mtm) for uid, (dig, mtm) in zip(clstr['urls'], digests)] def generate_rss(f, listurl, listtitle, editor, articles): # PyRSS2Gen doesn't handle unicode correctly. sys.setdefaultencoding('utf-8') timezone = datetime.timedelta(hours=9) # adjustment to GMT rssitems = [] for url, title, digest, mtime in articles: auid = str(uuid.uuid5(uuid.NAMESPACE_URL, url.encode('utf-8'))) rssitems.append( PyRSS2Gen.RSSItem( title = html_escape(title), link = url, description = digest, guid = PyRSS2Gen.Guid(auid, 0), pubDate = (datetime.datetime.fromtimestamp(mtime) - timezone), ) ) rss = PyRSS2Gen.RSS2( title = listtitle, link = listurl, description = 'Generated by AutoRSS on %s' % time.asctime(), lastBuildDate = datetime.datetime.now(), managingEditor = editor, items = rssitems ) rss.write_xml(f, 'utf-8') sys.setdefaultencoding('ascii') if __name__ == '__main__': RSSURL = ( 'http://ticket.interpark.com/Webzine/Paper/NoticeList.asp?' 'bbsno=34&KindOfGoods=TICKET' ) editor = 'Hye-Shik Chang ' aurl, atitle, articles = retrieve_webinfo(RSSURL) f = open('/home/perky/public_html/interpark.xml', 'w') generate_rss(f, aurl, atitle, editor, articles)