from email.Message import Message from email import message_from_string, message_from_file try: from email.header import decode_header except: from email.Header import decode_header import os import shutil import glob import tempfile import fcntl import re import datetime from BeautifulSoup import BeautifulSoup import sgmllib from htmlentitydefs import name2codepoint import sys from xml.sax.saxutils import escape # read and write to a file # assign an id? # A collection is stored in a directory # / # id # file with the last id used # /index directories used for indexing # /index/published Dir of soft links to entries, the link name is the published date-time # /index/updated Ditto but for 'updated' # /data # the mbox files with all the content # # Everything is converted into XHTML on the way in. # # All values except summary and content are # put into the headers. Summary and content are converted into # XHTML and placed into a single XHTML instance in the body of # the message, serialized as utf-8. illegal_xml_chars = re.compile(r"[\x01-\x08\x0B\x0C\x0E-\x1F]") namedref = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*);') preserved_named_ent=['amp', 'lt', 'gt', 'apos', 'quot'] def invalidate(c): """ replace invalid characters """ return u'\N{REPLACEMENT CHARACTER}' % \ ('000' + hex(ord(c.group(0)))[2:])[-4:] def nr2c(value): """ convert named character references to characters """ value=value.group(1) if value not in preserved_named_ent: if value in name2codepoint: value=unichr(name2codepoint[value]) else: value=u"\N{REPLACEMENT CHARACTER}" else: value = '&%s;' % value return value def ncr2c(value): """ convert named character references to characters """ value=value.group(1) if value.startswith('x'): value=unichr(int(value[1:],16)) else: value=unichr(int(value)) return value def normalize(soup): dom = BeautifulSoup(soup, convertEntities="html") for tag in dom.findAll(True): dupattr = {} seen = {} for attr,value in tag.attrs: if attr in seen: dupattr[attr] = value value = sgmllib.charref.sub(ncr2c, value) value = illegal_xml_chars.sub(u'\uFFFD', value) tag[attr] = value seen[attr] = 1 # Removed duplicate attributes for attr, value in dupattr.iteritems(): print attr del tag[attr] tag[attr] = value text = illegal_xml_chars.sub(invalidate, unicode(dom)) text = sgmllib.charref.sub(ncr2c, text) text = namedref.sub(nr2c, text) return text def _text_only(title): if title: #title = BeautifulSoup(title, convertEntities="html") #return u''.join([e for e in title.recursiveChildGenerator() if isinstance(e, unicode)]) return title else: return u'' def create_path(*components): """Create the path from the components. Return True if the path did not exist.""" path = os.path.join(*components) if not os.path.exists(path): os.makedirs(path, mode=0755) return True return False def wrap(member): content = member['content'].encode("utf-8") summary = member['summary'].encode("utf-8") title = member['title'].encode("utf-8") member['content_full'] = "
%s %s
" % (summary, content) if summary: member['content_short'] = "
%s
" % summary else: member['content_short'] = member['content_full'] return member def simple_wrap(member): content = member['content'].encode("utf-8") summary = member['summary'].encode("utf-8") title = member['title'].encode("utf-8") member['summary_wrap'] = "
%s
" % summary member['content_wrap'] = "
%s
" % content return member # These are the only bits we really care about: # All values at these keys are presumed to be unicode strings. atom_reserved = ['content', 'summary'] re_unreserved = re.compile(r'[^a-zA-Z0-9]+') re_int = re.compile(r'^[0-9]+$') class Member(dict): """An extended dictionary that is used for members of collections The extended part allows you to get and set values as attributes. That is, d.fred is the same as d['fred'] In addition, appending an attrbute name with __wrap will return that content wrapped in an xhtml div. I.e. d.summary__wrap is the summary value wrapped in a div. There are two attributes available, content_full and content_short that return the 'right' choices of summary and content wrapped in an xhtml div. d.content_full """ def __getattr__(self, key): try: return self.__dict__[key] except KeyError: pass try: assert not key.startswith('_') if key.endswith("__wrap"): return u"
%s
" % self.__getitem__(key.rsplit("__wrap")[0]) elif key == "content_full": return u"
%s %s
" % (self.__getitem__("summary"), self.__getitem__("content")) elif key == "content_short": summary = self.__getitem__("summary") if summary: return u"
%s

...

" % summary else: return u"
%s
" % self.__getitem__("content") else: return self.__getitem__(key) except: raise AttributeError, "object has no attribute '%s'" % key def __setattr__(self, key, value): if key.startswith('_'): self.__dict__[key] = value else: return self.__setitem__(key, value) class Collection(object): def __init__(self, dir, trash=None): self.dir = dir self.trash = trash create_path(dir, "data") if create_path(dir, "index", "published"): self._re_index('published') if create_path(dir, "index", "updated"): self._re_index('updated') if not os.path.exists(os.path.join(dir, 'id')): self._re_write_idfile() def _re_write_idfile(self): f = file(os.path.join(self.dir, 'id'), "w+") ids = [int(name) for name in os.listdir(os.path.join(self.dir, "data")) if re_int.match(name)] if ids: max_id = max(ids)+1 else: max_id = 1 f.write(str(max_id)) f.close() def reindex(self): self._re_index('published') self._re_index('updated') self._re_write_idfile() def _get_meta(self, id, meta): filename = os.path.join(self.dir, "data", id) f = file(filename, "r") msg = message_from_file(f) f.close() return msg[meta] def _filename(self, id): return os.path.join(self.dir, "data", id) def _filename_rel(self, id): return os.path.join("..", "..", "data", id) def _index_link_name(self, id, meta): path = os.path.join(self.dir, "index", meta) return os.path.join(path, self._get_meta(id, meta) + "-" + id) def _index_link_name_dated(self, id, date, meta): path = os.path.join(self.dir, "index", meta) return os.path.join(path, date + "-" + id) def _re_index(self, meta): path = os.path.join(self.dir, "index", meta) for f in os.listdir(path): os.unlink(os.path.join(path, f)) for id in self.id_list(): os.symlink(self._filename_rel(id), self._index_link_name(id, meta)) def _msg_from_dict(self, member, old_id=None, remove_old_index=True): """Create a set of headers and a message body from the dictionary passed in. If the old_id is passed in then use it as a starting point, thus allowing old information dropped by the client to be preserved. """ filename = self._filename(old_id) # Put everything in a dict to avoid duplicate keys, which an email.Message allows msg = Member() if old_id and os.path.exists(filename): old_msg = message_from_file(file(filename, "r")) msg.update([(key, old_msg[key]) for key in old_msg.keys()]) if remove_old_index: # Removed old 'updated' link if present link_fn = self._index_link_name_dated(old_id, msg["updated"], "updated") if os.path.lexists(link_fn): os.unlink(link_fn) msg.update([(key, value) for (key, value) in member.iteritems() if key not in atom_reserved]) message = Message() for key, value in msg.iteritems(): message[key] = value # Perform magic on content and summary content = member.get('content', u'') content = normalize(content) summary = member.get('summary', u'') summary = normalize(summary) content = u"""%s %s""" % (summary, content) headers = message.as_string() headers = re.sub("\r(?!\n)|(?