#!/usr/bin/env python
# coding: utf-8
#
# autorss.py: Automatic RSS generator
#
# Copyright (c) 2007 Hye-Shik Chang
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
# 1. The origin of this software must not be misrepresented; you must not
# claim that you wrote the original software. If you use this software
# in a product, an acknowledgment in the product documentation would be
# appreciated but is not required.
#
# 2. Altered source versions must be plainly marked as such, and must not be
# misrepresented as being the original software.
#
# 3. This notice may not be removed or altered from any source
# distribution.
#
from __future__ import division
import os
import sys
import urllib
import urlparse
import difflib
import re
import math
import time
import datetime
import uuid
import unicodedata
import difflib
import hashlib
from BeautifulSoup import BeautifulSoup, Tag
from numpy import zeros
from Bio.Cluster import treecluster
import PyRSS2Gen
# Python 2's startup code (site.py) removes sys.setdefaultencoding from the
# sys module; reloading sys makes the function available again so that
# generate_rss() can temporarily switch the default encoding around the
# PyRSS2Gen calls.
reload(sys) # recover setdefaultencoding for PyRSS2Gen
class URLDistanceMeter(object):
    """Callable that scores how different two URL strings are.

    The two strings are compared with difflib's intraline diff; every
    inserted/deleted character is charged its ``indel`` penalty and every
    substituted character its ``subs`` penalty.  Characters that are missing
    from a table are charged ``default_penalty``.

    Parameters:
        subs: mapping of character -> penalty for substituted characters.
        indel: mapping of character -> penalty for inserted or deleted
            characters.
        default_penalty: penalty for any character absent from the tables.
    """

    # difflib._mdiff wraps each changed run as '\0' + marker + text + '\1',
    # where the marker is '+' (added), '-' (deleted) or '^' (changed).
    pat_indel = re.compile(r'\x00[+-]([^\x01]*)\x01')
    pat_subs = re.compile(r'\x00\^([^\x01]*)\x01')

    def __init__(self, subs=None, indel=None, default_penalty=1):
        # Fresh dicts per instance; a shared mutable default would leak
        # state between meters.
        self.subs = {} if subs is None else subs
        self.indel = {} if indel is None else indel
        self.default_penalty = default_penalty

    @staticmethod
    def build_penalty_table(digit, alpha, special, exception=None):
        """Build a penalty table covering the 256 single-byte characters.

        Characters listed in ``exception`` get the penalty given there;
        otherwise digits cost ``digit``, letters cost ``alpha``, remaining
        printable ASCII (space through '~') costs ``special`` and anything
        else costs 1.
        """
        if exception is None:
            exception = {}
        tbl = {}
        for c in map(chr, range(256)):
            if c in exception:
                tbl[c] = exception[c]
            elif c.isdigit():
                tbl[c] = digit
            elif c.isalpha():
                tbl[c] = alpha
            elif ' ' <= c <= '\x7e':
                tbl[c] = special
            else:
                tbl[c] = 1
        return tbl

    def score_part(self, dpart):
        """Return the total penalty of the change markers found in ``dpart``.

        ``dpart`` is one side of a difflib._mdiff result, i.e. the line text
        with its changed runs wrapped in '\\0' ... '\\1' markers.
        """
        fallback = self.default_penalty
        total = 0
        for run in self.pat_indel.findall(dpart):
            total += sum(self.indel.get(c, fallback) for c in run)
        for run in self.pat_subs.findall(dpart):
            total += sum(self.subs.get(c, fallback) for c in run)
        return total

    def __call__(self, v, w):
        """Return the distance between strings ``v`` and ``w`` (0 if equal)."""
        # Each string is handed to difflib as a one-line document, so the
        # first yielded pair describes the whole comparison.
        vdif, wdif, different = next(difflib._mdiff([v], [w]))
        if not different:
            return 0
        return self.score_part(vdif[1]) + self.score_part(wdif[1])
def http_url(url, baseurl):
    """Resolve ``url`` against ``baseurl`` and keep it only if it is web-reachable.

    Returns the joined absolute URL when it begins with 'http://' or
    'https://', otherwise None.
    """
    absolute = urlparse.urljoin(baseurl, url)
    if absolute.startswith(('http://', 'https://')):
        return absolute
    return None
def extract_contents(node):
    """Return all text found under ``node`` as one unicode string.

    A unicode object is returned unchanged; anything else is expected to
    expose its children through ``.contents`` and is flattened recursively.
    """
    if not isinstance(node, unicode):
        pieces = [extract_contents(child) for child in node.contents]
        return u''.join(pieces)
    return node
def find_links(soup, listurl):
    """Yield ``(absolute_url, link_text)`` for every usable <a> tag in ``soup``.

    An anchor without an href is treated as 'about:blank'; anchors whose
    target does not resolve to an http(s) URL relative to ``listurl`` are
    skipped.
    """
    for anchor in soup.findAll('a'):
        href = dict(anchor.attrs).get('href', 'about:blank')
        target = http_url(href, listurl)
        if target:
            yield (target, extract_contents(anchor))
def build_distmatrix(urls, distfunc):
    """Return the symmetric matrix of pairwise distances between ``urls``.

    ``urls`` is a sequence of tuples whose first item is the URL string.
    ``distfunc(a, b)`` is called once for every unordered pair; the
    diagonal stays zero.
    """
    size = len(urls)
    matrix = zeros([size, size])
    for row, entry in enumerate(urls):
        # Only the part below the diagonal is computed, then mirrored.
        for col, earlier in enumerate(urls[:row]):
            distance = distfunc(entry[0], earlier[0])
            matrix[row, col] = distance
            matrix[col, row] = distance
    return matrix
def build_clusters(urls, distmatrix, aggrsize=3, mindistwindow=1, bigleap=2):
    """Group URL indexes into clusters by walking a hierarchical join tree.

    The tree comes from Bio.Cluster's ``treecluster`` (method 's') run on
    ``distmatrix``.  Joins are applied in tree order until one of the two
    stop conditions below fires.

    Parameters:
        urls: the link list the matrix was built from (only its length is
            used here).
        distmatrix: pairwise distance matrix for ``urls``.
        aggrsize: unique-member count at which a cluster counts as
            well-aggregated.
        mindistwindow: smallest join distance at which the first stop
            condition may fire.
        bigleap: factor applied to the children's summed distances before
            comparing with the join distance in the second stop condition.

    Returns:
        The values of the cluster table: dicts with the keys 'urls' (list of
        indexes into ``urls``), 'maxdist' (distance of the join that formed
        the cluster) and 'unique' (member count, counted as 1 for a join at
        distance 0).
    """
    tree = treecluster(method='s', distancematrix=distmatrix)
    clusters = {}

    def describe(node):
        # Negative ids refer to clusters recorded by an earlier join;
        # non-negative ids are treated as a single item.
        if node < 0:
            merged = clusters[node]
            return merged['urls'], merged['unique'], merged['maxdist']
        return [node], 1, 0

    # Join clusters until enough aggregations form.
    for step, join in enumerate(tree):
        l_members, l_uniq, l_dist = describe(join.left)
        r_members, r_uniq, r_dist = describe(join.right)
        if join.distance > 0:
            merged_uniq = l_uniq + r_uniq
        else:
            merged_uniq = 1

        # Stop once two well-aggregated clusters (or the last two pieces
        # covering every url) would be merged across a wide enough gap.
        if join.distance >= mindistwindow:
            if l_uniq >= aggrsize and r_uniq >= aggrsize:
                break
            if len(l_members) + len(r_members) == len(urls):
                break

        # Avoid joining a well-aggregated cluster to a quite different new
        # member.
        if l_dist or r_dist:
            if (l_dist + r_dist) * bigleap < join.distance:
                break

        # The join at position `step` is stored under the id -(step + 1).
        clusters[-step - 1] = {
            'urls': l_members + r_members,
            'maxdist': join.distance,
            'unique': merged_uniq,
        }
        # The children are now part of the merged cluster.
        for child in (join.left, join.right):
            if child < 0:
                del clusters[child]
    return clusters.values()
def wcwidth(ustr):
    """Return the display width of ``ustr`` in terminal columns.

    Characters whose East Asian Width class is W, A or F count as two
    columns, every other character as one.
    """
    double_width = ('W', 'A', 'F')
    width = len(ustr)
    for char in ustr:
        if unicodedata.east_asian_width(char) in double_width:
            width += 1
    return width
class GammaScoreFunction(object):
    """Score function shaped like a gamma distribution density.

    Calling the instance with ``x`` returns the density at ``x`` scaled so
    that the value at ``x = (k - 1) * theta`` equals ``weight``.

    Parameters:
        k: shape parameter; must be an integer because a factorial of it is
            taken.
        theta: scale parameter.
        weight: score returned at ``(k - 1) * theta``.
    """

    def __init__(self, k, theta, weight):
        self.k = k
        self.theta = theta
        self.kf = self.factorial(k)
        # Dividing by this value in __call__ rescales the curve; any
        # constant factor in gammapdf (including kf) cancels out there.
        self.peak = self.gammapdf((k - 1) * theta) / weight

    @staticmethod
    def factorial(v):
        """Return ``v!``; values of ``v`` below 1 give 1."""
        # math.factorial rejects negative input, whereas the earlier
        # product-over-range implementation returned 1; keep that result.
        return math.factorial(v) if v > 0 else 1

    def gammapdf(self, x):
        """Return ``x**(k-1) * exp(-x/theta) / (k! * theta**k)``.

        Note that the divisor uses ``k!`` as computed in ``__init__``.
        """
        return ((x ** (self.k - 1)) * math.exp(-x / self.theta)
                / (self.kf * self.theta ** self.k))

    def __call__(self, x):
        """Return the scaled score for ``x``."""
        return self.gammapdf(x) / self.peak
def pick_cluster(clusters, urls, score_length, score_csize, score_maxdist):
    """Score every cluster and return the highest-scoring one.

    Each cluster dict gains a 'score' key: the sum of ``score_maxdist`` of
    its 'maxdist', ``score_length`` of the average display width of its
    link texts and ``score_csize`` of its 'unique' count.  When several
    clusters share the top score, the last of them in ``clusters`` order is
    returned.
    """
    for cluster in clusters:
        widths = [wcwidth(urls[index][1]) for index in cluster['urls']]
        mean_width = sum(widths) / len(widths)
        cluster['score'] = (score_maxdist(cluster['maxdist'])
                            + score_length(mean_width)
                            + score_csize(cluster['unique']))
    ranked = sorted(clusters, key=lambda cluster: cluster['score'])
    return ranked[-1]
def html_escape(html):
    """Return ``html`` with '&', '<' and '>' replaced by character entities.

    The previous body replaced each of the three characters with itself and
    therefore escaped nothing.
    """
    # '&' has to be handled first, otherwise the ampersands introduced by
    # the other two replacements would be escaped a second time.
    for char, entity in (('&', 'amp'), ('<', 'lt'), ('>', 'gt')):
        html = html.replace(char, '&' + entity + ';')
    return html
def html_simplify(html):
    """Reduce an HTML fragment to escaped text, line breaks and images.

    ``html`` is parsed with BeautifulSoup (as UTF-8) and rewritten:
      * text nodes are stripped and HTML-escaped,
      * <br> and <p> tags become a line-break tag followed by a newline,
      * <img> tags are kept verbatim,
      * every other tag is dropped and only its children are kept.

    Returns the simplified markup as a unicode string.
    """
    def simplify_contents(node):
        if isinstance(node, unicode):
            return html_escape(node.strip())
        elif isinstance(node, Tag):
            if node.name in (u'br', u'p'):
                # This literal was split across two source lines (an
                # unterminated string); restored as a single break tag.
                return u'<br />\n'
            elif node.name == u'img':
                return unicode(node)
        return u''.join(simplify_contents(child) for child in node.contents)

    soup = BeautifulSoup(html, fromEncoding='utf-8')
    return simplify_contents(soup)
def diff_digest(left, right):
    """Return simplified HTML digests of what is unique to each document.

    ``left`` and ``right`` are lists of lines.  Lines that difflib.ndiff
    reports only for ``left`` ('-') or only for ``right`` ('+') are
    collected separately; a splitter is inserted wherever other diff output
    interrupts a run of collected lines.  Both collections are then passed
    through html_simplify.

    Returns a ``(left_digest, right_digest)`` tuple.
    """
    leftdig = []
    rightdig = []
    # NOTE(review): this literal was split across two source lines (an
    # unterminated string).  A break tag is assumed because html_simplify
    # keeps <br> and drops most other tags -- confirm against the intended
    # output.
    splitter = '<br />'
    for dline in difflib.ndiff(left, right):
        # NOTE(review): ndiff prefixes are two characters wide ('- ', '+ ');
        # [1:] keeps the second one.  '? ' hint lines take the else branch.
        if dline.startswith('-'):
            leftdig.append(dline[1:])
        elif dline.startswith('+'):
            rightdig.append(dline[1:])
        else:
            # Mark the gap once per side, and never at the very start.
            if leftdig and leftdig[-1] != splitter:
                leftdig.append(splitter)
            if rightdig and rightdig[-1] != splitter:
                rightdig.append(splitter)
    return html_simplify('\n'.join(leftdig)), html_simplify('\n'.join(rightdig))
class DocumentDiffCache(object):
    """On-disk cache of downloaded pages and of their diff digests.

    Every URL maps to a file named after the SHA-1 of the URL inside
    ``topdir``; the digest computed for that URL is stored next to it with
    ``digestsuffix`` appended.
    """

    digestsuffix = '-digest'    # appended to a page's cache path
    cacheencoding = 'utf-8'     # encoding of digest files and hashed URLs
    retrievedelay = 5           # seconds slept before each download

    def __init__(self, topdir='./cache'):
        """Use ``topdir`` as the cache directory, creating it if missing."""
        self.topdir = topdir
        if not os.path.isdir(topdir):
            # makedirs also copes with a nested path whose parents are
            # missing, which os.mkdir does not.
            os.makedirs(topdir)

    def get(self, url):
        """Return an open file with the content of ``url``.

        The page is downloaded (after ``retrievedelay`` seconds) only when
        it is not cached yet.  The caller is responsible for closing the
        returned file.
        """
        cachepath = self.cachepath(url)
        if not os.path.exists(cachepath):
            time.sleep(self.retrievedelay)
            urllib.urlretrieve(url, cachepath)
        return open(cachepath)

    def cachepath(self, url):
        """Return the cache file path used for ``url``."""
        # sha1 needs bytes; encoding explicitly keeps non-ASCII unicode
        # URLs from failing and leaves the hash of ASCII URLs unchanged.
        if not isinstance(url, bytes):
            url = url.encode(self.cacheencoding)
        hashid = hashlib.sha1(url).hexdigest()
        return os.path.join(self.topdir, hashid)

    def _read_digest(self, path):
        """Return the decoded text of the digest file at ``path``."""
        with open(path, 'rb') as digestfile:
            return digestfile.read().decode(self.cacheencoding)

    def _write_digest(self, path, digest):
        """Store ``digest`` (unicode) in the digest file at ``path``."""
        with open(path, 'wb') as digestfile:
            digestfile.write(digest.encode(self.cacheencoding))

    def diff(self, url1, url2):
        """Return ``(digest, mtime)`` for each of the two URLs.

        When both digest files already exist they are returned as a list
        without touching the network.  Otherwise both pages are fetched,
        pretty-printed and diffed against each other; digests that are not
        on disk yet are written, and the pair is returned as a tuple.
        ``mtime`` is the modification time of the digest file.
        """
        u1path = self.cachepath(url1) + self.digestsuffix
        u2path = self.cachepath(url2) + self.digestsuffix
        if os.path.exists(u1path) and os.path.exists(u2path):
            return [(self._read_digest(path), os.path.getmtime(path))
                    for path in (u1path, u2path)]

        pages = []
        for url in (url1, url2):
            page = self.get(url)
            try:
                pages.append(BeautifulSoup(page).prettify().splitlines())
            finally:
                page.close()
        digest1, digest2 = diff_digest(pages[0], pages[1])

        for path, digest in ((u1path, digest1), (u2path, digest2)):
            # Don't overwrite an existing digest, to keep its datetime.
            if not os.path.exists(path):
                self._write_digest(path, digest)
        u1mtime = os.path.getmtime(u1path)
        u2mtime = os.path.getmtime(u2path)
        return (digest1, u1mtime), (digest2, u2mtime)
def generate_digests(urls, cache=None):
    """Return a ``(digest, mtime)`` entry for every URL in ``urls``.

    URLs are diffed in consecutive pairs through ``cache.diff``.  When the
    number of URLs is odd, the last one is paired with the first, so the
    result then carries one extra trailing entry for the first URL.

    Parameters:
        urls: list of URL strings; it is not modified.
        cache: object providing ``diff(url1, url2)``; a DocumentDiffCache
            with default settings is created when omitted.
    """
    if len(urls) % 2 == 1:
        assert len(urls) > 2 # can't generate digest with only 1 url
        # Build a new list: the in-place `+=` used before also grew the
        # caller's list.
        urls = urls + [urls[0]]
    if cache is None:
        cache = DocumentDiffCache()
    digests = []
    for first, second in zip(urls[::2], urls[1::2]):
        digests.extend(cache.diff(first, second))
    return digests
def retrieve_webinfo(url):
    """Fetch the list page at ``url`` and describe its article links.

    The links of the page are clustered by URL similarity, the
    best-scoring cluster is taken as the article list, and a content
    digest is produced for each of its links.

    Returns ``(url, title, articles)`` where ``title`` is the first node of
    the page's <title> (or ``url`` when the page has none) and ``articles``
    is a list of ``(link_url, link_text, digest, mtime)`` tuples.
    """
    # Parameter Setting
    # Penalty tables take (digit, alpha, special, per-character overrides).
    subs = URLDistanceMeter.build_penalty_table(0.1, 0.5, 2, {'/': 2, '%': 0.5})
    indel = URLDistanceMeter.build_penalty_table(0.2, 1, 4, {'/': 6, '%': 0.8})
    dist = URLDistanceMeter(subs, indel)
    # Score functions take (k, theta, weight).
    score_length = GammaScoreFunction(3, 18, 1)
    score_csize = GammaScoreFunction(3, 5, 1)
    score_maxdist = GammaScoreFunction(3, 0.4, 1)
    cache = DocumentDiffCache()

    page = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(page)
    finally:
        # The response was previously left open.
        page.close()

    # Fall back to the page URL instead of crashing on a page without a
    # usable <head><title>.
    head = soup.find('head')
    title = head.find('title') if head is not None else None
    if title is not None and title.contents:
        atitle = title.contents[0]
    else:
        atitle = url

    urls = list(find_links(soup, url))
    distmatrix = build_distmatrix(urls, dist)
    clusters = build_clusters(urls, distmatrix)
    clstr = pick_cluster(clusters, urls, score_length, score_csize, score_maxdist)
    curls = [urls[uid][0] for uid in clstr['urls']]
    digests = generate_digests(curls, cache)
    # zip() stops at the shorter sequence, which drops the extra digest
    # generate_digests returns for an odd number of links.
    return url, atitle, [(urls[uid][0], urls[uid][1], dig, mtm)
                         for uid, (dig, mtm) in zip(clstr['urls'], digests)]
def generate_rss(f, listurl, listtitle, editor, articles):
    """Write an RSS 2.0 feed describing ``articles`` to the file ``f``.

    Parameters:
        f: writable file object receiving the XML.
        listurl: URL of the list page, used as the channel link.
        listtitle: channel title.
        editor: value for the channel's managingEditor field.
        articles: iterable of ``(url, title, digest, mtime)`` tuples;
            ``mtime`` is a POSIX timestamp.

    All datetimes handed to PyRSS2Gen are expressed in GMT, independent of
    the machine's local timezone.
    """
    # PyRSS2Gen doesn't handle unicode correctly, so the process-wide
    # default encoding is switched while it runs and restored afterwards,
    # even when an error occurs.
    previous_encoding = sys.getdefaultencoding()
    sys.setdefaultencoding('utf-8')
    try:
        rssitems = []
        for url, title, digest, mtime in articles:
            # Stable per-article id derived from the URL.
            auid = str(uuid.uuid5(uuid.NAMESPACE_URL, url.encode('utf-8')))
            rssitems.append(
                PyRSS2Gen.RSSItem(
                    title = html_escape(title),
                    link = url,
                    description = digest,
                    guid = PyRSS2Gen.Guid(auid, 0),
                    # Replaces a fixed "local time minus 9 hours" offset.
                    pubDate = datetime.datetime.utcfromtimestamp(mtime),
                )
            )
        rss = PyRSS2Gen.RSS2(
            title = listtitle,
            link = listurl,
            description = 'Generated by AutoRSS on %s' % time.asctime(),
            lastBuildDate = datetime.datetime.utcnow(),
            managingEditor = editor,
            items = rssitems
        )
        rss.write_xml(f, 'utf-8')
    finally:
        sys.setdefaultencoding(previous_encoding)
if __name__ == '__main__':
    # Build the feed for one fixed notice board and write it to a fixed
    # location.
    RSSURL = (
        'http://ticket.interpark.com/Webzine/Paper/NoticeList.asp?'
        'bbsno=34&KindOfGoods=TICKET'
    )
    # NOTE(review): the trailing space suggests something (possibly an
    # e-mail address) is missing from this value -- confirm.
    editor = 'Hye-Shik Chang '
    aurl, atitle, articles = retrieve_webinfo(RSSURL)
    f = open('/home/perky/public_html/interpark.xml', 'w')
    try:
        generate_rss(f, aurl, atitle, editor, articles)
    finally:
        # Make sure the feed is flushed and the handle released.
        f.close()