el2atom

#!/usr/bin/env python2
# coding: UTF-8
#
# Copyright © 2014, David McIntosh <dmcintosh@df12.net>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided this permission
# notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

"""
A script to generate an atom feed based on a directory of Ello posts backed up
using the el-takeaway tool.  Post files must be named 'post-*.md'.  Posts
beginning with the @ character are excluded.  Posts will be included in the
atom feed in reverse chronological order.

To install, make this file executable, then place in a directory in your
executables path.  (This may be ~/bin).

Usage: el2atom backup_dir > atom.xml

The atom file will be written to the standard output.
"""

from __future__ import unicode_literals, print_function, division

import os, sys, string, codecs

from markdown import markdown
import elementflow

class PostReader(object):

    """Read a post file as created by the el-takeaway tool.

    The first line of the file is taken as the post's datetime string and
    the second line as the first line of the body.  A body that starts with
    the @ character marks the post as "addressed".

    Iterating over the reader yields the body lines.  A run of lines
    enclosed by a line containing only '{' and a line containing only '}' is
    treated as an asset description and is yielded as a single string (see
    render_asset).
    """

    def __init__(self, post_file):
        """Consume the datetime line and the first body line of post_file.

        post_file must be an iterator over text lines (such as an open
        file).  Files with fewer than two lines are accepted: the missing
        values keep their defaults ('' for the datetime, no body).
        """
        self.datetime = ''
        self.addressed = False
        # Always define ``first`` so that get_next() also works for empty
        # and one-line files.
        self.first = None
        try:
            # The next() builtin works on both Python 2 and Python 3
            # iterators, unlike the Python 2 only .next() method.
            self.datetime = next(post_file).rstrip()
            self.first = next(post_file)
            self.addressed = self.first.startswith('@')
        except StopIteration:
            pass
        self.post_file = post_file
        self.asset_dict = {}
        self.asset_lines = []
        self.in_asset = False

    def __iter__(self):
        """Identify this object as an iterator"""
        return self

    def update_asset_dict(self, line):
        """Record one line found between the '{' and '}' of an asset.

        The raw line is always kept in asset_lines.  If the line contains a
        ':' it is split into a key and a value, both stripped of whitespace,
        quotes and commas, and stored in asset_dict.  Lines without a ':'
        are kept as raw text only.
        """
        self.asset_lines.append(line)
        key, colon, value = line.strip().partition(':')
        if not colon:
            return
        strip_chars = string.whitespace + '"\','
        self.asset_dict[key.strip(strip_chars)] = value.strip(strip_chars)

    def render_asset(self):
        """Return the HTML representation of the current asset.

        If the collected entries do not include both 'url' and 'asset_id'
        the block is not considered an asset and its raw lines (without the
        enclosing brace lines) are returned unchanged.  Otherwise an img tag
        is returned, followed by a blank line.
        """
        # Function-scope import: keeps the class self-contained without
        # touching the module level import list.
        from xml.sax.saxutils import escape

        asset = self.asset_dict
        if 'url' not in asset or 'asset_id' not in asset:
            return ''.join(self.asset_lines)

        # The tag ends up verbatim inside the XML feed, so a raw '&', '<' or
        # '"' in an attribute value would make the feed ill-formed.
        quote = {'"': '&quot;'}
        url = escape(asset['url'], quote)
        if 'alt' in asset:
            return '<img src="{url}" alt="{alt}" />\n\n'.format(
                    url=url,
                    alt=escape(asset['alt'], quote)
                )
        return '<img src="{url}" />\n\n'.format(url=url)

    def get_next(self):
        """Return the raw next line of the post body.

        The first body line was already consumed by the constructor; it is
        handed out once and then cleared.  After that, lines come straight
        from the underlying file.  Raises StopIteration at end of file.
        """
        if self.first is not None:
            line = self.first
            self.first = None
        else:
            line = next(self.post_file)
        return line

    def next(self):
        """Return the next piece of body text.

        Ordinary lines are returned as they are.  The lines making up an
        asset are consumed silently and the rendered asset is returned when
        its closing '}' is reached, so None is never returned.
        """
        while True:
            line = self.get_next()
            marker = line.rstrip()
            if marker == '{':
                self.in_asset = True
                # Start from a clean slate so that nothing from an earlier
                # asset leaks into this one.
                self.asset_dict = {}
                self.asset_lines = []
            elif marker == '}' and self.in_asset:
                self.in_asset = False
                return self.render_asset()
            elif self.in_asset:
                self.update_asset_dict(line)
            else:
                # Also reached for a '}' without a matching '{', which is
                # ordinary text rather than the end of an asset.
                return line

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def read(self):
        """Read the file contents fully, replacing assets with HTML img tags"""
        return ''.join(self)

    def parse_markdown(self):
        """Return the HTML representation of the markdown body"""
        return markdown(self.read())

def read_dot_et_file(dirname, filename='.et'):
    """Read a dictionary of key/value pairs from a .et file in dirname.

    Each significant line has the form ``key = value``; only the first '='
    separates the key from the value, and both are stripped of surrounding
    whitespace.  Blank lines and lines whose first non-blank character is
    '#' are ignored.  The file is decoded as UTF-8, like the post files.

    Raises ValueError if the file does not exist or if a significant line
    contains no '='.
    """
    path = os.path.join(dirname, filename)
    if not os.path.isfile(path):
        raise ValueError('Invalid .et file: ' + path)
    pairs = {}
    with codecs.open(path, encoding='utf8') as fileh:
        for line in fileh:
            entry = line.strip()
            # Blank lines (including a trailing one) carry no setting.
            if not entry or entry.startswith('#'):
                continue
            key, equals, value = entry.partition('=')
            if not equals:
                raise ValueError(
                    'Invalid line in .et file {0}: {1}'.format(path, entry))
            pairs[key.strip()] = value.strip()
    return pairs

def parse_dir(dirname, addressed=False):
    """Yield (datetime string, post id, HTML body) triples for the post
    files in a directory, in reverse file name order.

    Only files named 'post-*.md' are considered.  The post id is the part
    of the file name between the first '-' and the next '-' or '.'.  Posts
    whose body starts with the @ character are skipped unless addressed is
    true.
    """
    names = os.listdir(dirname)
    names.sort(reverse=True)
    for name in names:
        if not (name.startswith('post-') and name.endswith('.md')):
            continue
        stem = name.split('.')[0]
        post_id = stem.split('-')[1]
        full_path = os.path.join(dirname, name)
        with codecs.open(full_path, encoding='utf8') as handle:
            post = PostReader(handle)
            if post.addressed and not addressed:
                continue
            yield post.datetime, post_id, post.parse_markdown()

def username_from_source(source):
    """Get a username from an Ello JSON url.

    The username is the last path segment of source up to its first '.',
    e.g. 'https://ello.co/someone.json' gives 'someone'.  A source without
    a '/' or without a '.' in its last segment is handled as well: the
    missing separator is simply treated as absent.
    """
    json_file = source.rsplit('/', 1)[-1]
    return json_file.split('.', 1)[0]

def link_from_source(source):
    """Get the www link from an Ello JSON url.

    The link is source with the extension of its last path segment removed,
    e.g. 'https://ello.co/someone.json' gives 'https://ello.co/someone'.
    Only the part after the last '/' is examined, so a source whose last
    segment has no extension is returned unchanged instead of being cut at
    a '.' in the host name.
    """
    head, slash, tail = source.rpartition('/')
    return head + slash + tail.rsplit('.', 1)[0]
    
def emit_post(out_file, xml, post_datetime, post, post_id=None, author=None):
    """Write one atom 'entry' element for a single post.

    xml is the element writer used for the markup and out_file is the file
    it writes to.  The entry holds an 'id' (only if post_id is given, built
    from the post id with leading zeros removed), an 'updated' element, an
    'author' (only if author is given) and an xhtml 'content' element.  The
    post body is written to out_file as is, inside the content's div.
    """
    content_attrs = {'type': 'xhtml'}
    div_attrs = {'xmlns': 'http://www.w3.org/1999/xhtml'}

    with xml.container('entry'):
        if post_id:
            entry_id = 'https://ello.co/api/v1/posts/' + post_id.lstrip('0')
            xml.element('id', text=entry_id)

        xml.element('updated', text=post_datetime)

        if author:
            with xml.container('author'):
                xml.element('name', text=author)

        with xml.container('content', attrs=content_attrs):
            with xml.container('div', attrs=div_attrs):
                # The body is already HTML, so it bypasses the xml writer.
                out_file.write(post)

def emit_atom(out_file, posts, source=None):
    """Generate a bare bones atom feed.

    posts is any iterable (list or generator) of
    (datetime string, post id, body) triples, newest first.  If source, the
    Ello JSON url, is given, the feed gets a title and a link derived from
    it and every entry names the user as its author.  The feed's 'updated'
    element carries the datetime of the first post.  When there are no
    posts, the feed contains no 'updated' element and no entries.
    """

    username = username_from_source(source) if source else None
    link = link_from_source(source) if source else None

    # Accept lists as well as generators.
    posts = iter(posts)

    with elementflow.xml(out_file, u'feed', attrs={
            'xmlns': 'http://www.w3.org/2005/Atom'
        }, indent=True) as xml:

        if source:
            xml.element('title', text='{0} (Ello)'.format(username))
            xml.element('link', attrs={'href': link})

        # Only the lookup of the first post is guarded, so that a
        # StopIteration escaping from the emitting code is not mistaken for
        # "no posts".
        try:
            post_datetime, post_id, post = next(posts)
        except StopIteration:
            return

        xml.element('updated', text=post_datetime)
        # The first post gets its id like every other entry.
        emit_post(out_file, xml, post_datetime, post, post_id=post_id,
                author=username)

        for post_datetime, post_id, post in posts:
            emit_post(out_file, xml, post_datetime, post, post_id=post_id,
                    author=username)

def main(out_file, dirname):
    """Main script entry point, to avoid polluting the global namespace.

    Reads the .et file in dirname and writes an atom feed of the posts in
    dirname to out_file, using the 'source' setting of the .et file, if
    present, as the feed's source.
    """
    settings = read_dot_et_file(dirname)
    feed_source = settings.get('source', None)
    posts = parse_dir(dirname)
    emit_atom(out_file, posts, source=feed_source)

if __name__ == '__main__':

    if len(sys.argv) != 2:
        # The feed itself goes to the standard output, which is usually
        # redirected to a file, so the usage message belongs on the
        # standard error.
        print('Usage: {0} <backup_dir>'.format(sys.argv[0]), file=sys.stderr)
        sys.exit(8)

    # Wrap the standard output so that the feed is encoded as UTF-8.
    main(codecs.getwriter('utf8')(sys.stdout), sys.argv[1])