x2blog搬迁到wordpress

前面有讲对xml/html的库选择。既然~~例完假~~利完器，就要善其事了。其实x2blog提供了备份导出的功能，让整个事情变得非常简单。相比之下，歪酷本来有此功能现在却不知为何关闭了，让用户从歪酷搬走变得难上加难。

首先，x2blog导出的是按照月份为单位的日志，如下图所示：

每个xml文件的格式都是相同的，如下图所示：

可以从图里面看到，Info节点的title就是blog的名称，而每个Item对应着一篇日志。这里有点诡异的是所有日志的内容都在Abstract节点里面，而不是Content节点里面，估计是x2blog这个日志程序的bug。

有了这些信息，就可以去生成wordpress可以读懂的wxr文件了。

首先做一个类，初始化的时候就生成那些通用的内容，如xml声明部分等：

def __init__(self, input):
        """Creates the basic document."""
        self.input_tree = input
        self.site = ''

        self.xml = xml.dom.minidom.Document()
        self.rss = self._create_element('rss', self.xml)
        self.rss.setAttribute('xmlns:content',
                              'http://purl.org/rss/1.0/modules/content/')
        self.rss.setAttribute('xmlns:wfw',
                              'http://wellformedweb.org/CommentAPI/')
        self.rss.setAttribute('xmlns:dc', 'http://purl.org/dc/elements/1.1/')
        self.rss.setAttribute('xmlns:wp', 'http://wordpress.org/export/1.0/')
        self.channel = self._create_element('channel', self.rss)

然后就需要分别生成wordpress的cata、tag、item、comment这四部分内容。不过x2blog的导出备份文件里面并没有tag信息（又是一个bug吧），所以tag先不管。

日志分类处理

def create_category(self, nicename, name=""):
        """Creates a Category."""
        if name != "":
            category = self._create_element('wp:category', self.channel)
            self._create_element('wp:category_nicename', category, nicename)
            self._create_element('wp:category_parent', category)
            element = self._create_element('wp:cat_name', category)
            self._cdata(name, element)

日志

def create_item(self, data):
        """Creates an item from the Item element in the tree."""
        linkpath = datetime.strptime(data[1].text,"%Y-%m-%d %H:%M:%S").strftime('%Y/%m/%d')
        link = "%s/%s/%s" % (self.site, linkpath, data[].text.encode("utf-8"))
        item = self._create_element('item', self.channel)
        self._create_element('title', item, data[].text.encode("utf-8"))
        self._create_element('link', item, link)
        self._create_element('pubDate', item,
                            datetime.strptime(data[1].text,"%Y-%m-%d %H:%M:%S").strftime('%a, %d %b %Y %H:%M%S +0000'))
        self._create_element('dc:creator', item, 'admin')
        self.item_categories(item, data[2].text.encode("utf-8"))
        #no tag info, just left there
        #self.item_tags(item, xxx)
        guid = self._create_element('guid', item, link)
        guid.setAttribute('isPermaLink', 'true')
        self._create_element('description', item)
        element = self._create_element('content:encoded', item)
        self._cdata(data[4].text.encode("utf-8"), element)
        #self._create_element('wp:post_id', item, data[0])
        self._create_element('wp:post_date', item, data[1].text)
        self._create_element('wp:post_date_gmt', item, data[1].text)
        comments = 'open'
        self._create_element('wp:comment_status', item, comments)
        self._create_element('wp:ping_status', item, 'open')
        self._create_element('wp:post_name', item, data[].text.encode("utf-8"))
        self._create_element('wp:status', item, 'publish')
        self._create_element('wp:post_parent', item, '0')
        self._create_element('wp:menu_item', item, '0')
        self._create_element('wp:post_type', item, 'post')
        self.item_comments(item, data[7])

def item_comments(self, item, comment_elems):
        """Creates comments for an item."""
        for elem in comment_elems:
            comment = self._create_element('wp:comment', item)
            #self._create_element('wp:comment_id', comment, elem[0])
            element = self._create_element('wp:comment_author', comment)
            self._cdata(elem[1].text.encode("utf-8"), element)
            self._create_element('wp:comment_author_email', comment, 'x@y.com')
            self._create_element('wp:comment_author_url', comment, 'http://xxx')
            self._create_element('wp:comment_author_IP', comment, elem[2].text.encode("utf-8"))
            self._create_element('wp:comment_date', comment, elem[3].text.encode("utf-8"))
            self._create_element('wp:comment_date_gmt', comment, elem[3].text.encode("utf-8"))
            self._create_element('wp:comment_content', comment, elem[].text.encode("utf-8"))
            self._create_element('wp:comment_approved', comment, '1')
            self._create_element('wp:comment_type', comment)
            self._create_element('wp:comment_parent', comment, '0')

实际的文件解析处理过程，都放到了另外一个类里面来使得结构清晰。完整的代码：

"""Script to generate a WordPress eXtended RSS (WXR) file from a x2blog backup file
"""

#import psycopg2
import re
import sys
import xml.dom.minidom
from lxml import etree

from datetime import datetime
from optparse import OptionParser

class Export(object):
    """Handles the details of creating a WordPress eXtended RSS (WXR)."""

    def __init__(self, input):
        """Creates the basic document."""
        self.input_tree = input
        self.site = ''

        self.xml = xml.dom.minidom.Document()
        self.rss = self._create_element('rss', self.xml)
        self.rss.setAttribute('xmlns:content',
                              'http://purl.org/rss/1.0/modules/content/')
        self.rss.setAttribute('xmlns:wfw',
                              'http://wellformedweb.org/CommentAPI/')
        self.rss.setAttribute('xmlns:dc', 'http://purl.org/dc/elements/1.1/')
        self.rss.setAttribute('xmlns:wp', 'http://wordpress.org/export/1.0/')
        self.channel = self._create_element('channel', self.rss)

    def _create_element(self, name, parent=None, value=None):
        """Helper function for creating XML elements.

        Parameters:
        :param name: The name of the element
        :type name: ``str``
        :param parent: (Optional) The XML node you want this element to be a
        child of.
        :type b: ``Node``

        :return: The element
        :rtype: ``Node``.
        """
        element = self.xml.createElement(str(name))

        if parent:
            parent.appendChild(element)

        if value:
            element_value = self.xml.createTextNode(str(value))
            element.appendChild(element_value)

        return element

    def display(self):
        """Returns the formatted XML document."""
        return self.xml.toprettyxml('')

    def create_site_info(self, title, url, description):
        """Populates the site information."""
        self._create_element('title', self.channel, title)
        self.site = url
        self._create_element('link', self.channel, url)
        self._create_element('description', self.channel, description)
        self._create_element('pubDate', self.channel, datetime.utcnow())
        self._create_element('generator', self.channel, 'x2wp.py')
        self._create_element('language', self.channel, 'cn')

    def create_category(self, nicename, name=""):
        """Creates a Category."""
        if name != "":
            category = self._create_element('wp:category', self.channel)
            self._create_element('wp:category_nicename', category, nicename)
            self._create_element('wp:category_parent', category)
            element = self._create_element('wp:cat_name', category)
            self._cdata(name, element)


    def create_item(self, data):
        """Creates an item from the Item element in the tree."""
        linkpath = datetime.strptime(data[1].text,"%Y-%m-%d %H:%M:%S").strftime('%Y/%m/%d')
        link = "%s/%s/%s" % (self.site, linkpath, data[0].text.encode("utf-8"))
        item = self._create_element('item', self.channel)
        self._create_element('title', item, data[0].text.encode("utf-8"))
        self._create_element('link', item, link)
        self._create_element('pubDate', item,
                            datetime.strptime(data[1].text,"%Y-%m-%d %H:%M:%S").strftime('%a, %d %b %Y %H:%M%S +0000'))
        self._create_element('dc:creator', item, 'admin')
        self.item_categories(item, data[2].text.encode("utf-8"))
        #no tag info, just left there
        #self.item_tags(item, xxx)
        guid = self._create_element('guid', item, link)
        guid.setAttribute('isPermaLink', 'true')
        self._create_element('description', item)
        element = self._create_element('content:encoded', item)
        self._cdata(data[4].text.encode("utf-8"), element)
        #self._create_element('wp:post_id', item, data[0])
        self._create_element('wp:post_date', item, data[1].text)
        self._create_element('wp:post_date_gmt', item, data[1].text)
        comments = 'open'
        self._create_element('wp:comment_status', item, comments)
        self._create_element('wp:ping_status', item, 'open')
        self._create_element('wp:post_name', item, data[0].text.encode("utf-8"))
        self._create_element('wp:status', item, 'publish')
        self._create_element('wp:post_parent', item, '0')
        self._create_element('wp:menu_item', item, '0')
        self._create_element('wp:post_type', item, 'post')
        self.item_comments(item, data[7])

    def item_categories(self, item, cata):
        """Links an item to categories."""
        element = self._create_element('category', item)
        self._cdata(cata, element)

    def create_tag(self, name):
        """Creates a Tag."""

    def item_tags(self, item, item_id):
        """Links an item to tags."""

    def item_comments(self, item, comment_elems):
        """Creates comments for an item."""
        for elem in comment_elems:
            comment = self._create_element('wp:comment', item)
            #self._create_element('wp:comment_id', comment, elem[0])
            element = self._create_element('wp:comment_author', comment)
            self._cdata(elem[1].text.encode("utf-8"), element)
            self._create_element('wp:comment_author_email', comment, 'x@y.com')
            self._create_element('wp:comment_author_url', comment, 'http://xxx')
            self._create_element('wp:comment_author_IP', comment, elem[2].text.encode("utf-8"))
            self._create_element('wp:comment_date', comment, elem[3].text.encode("utf-8"))
            self._create_element('wp:comment_date_gmt', comment, elem[3].text.encode("utf-8"))
            self._create_element('wp:comment_content', comment, elem[0].text.encode("utf-8"))
            self._create_element('wp:comment_approved', comment, '1')
            self._create_element('wp:comment_type', comment)
            self._create_element('wp:comment_parent', comment, '0')

    def _cdata(self, data, parent):
        """Helper function for creating CDATA sections."""
        cdata = self.xml.createCDATASection(data)
        parent.appendChild(cdata)

    def finalise(self):
        """Final cleanup."""
        wxr = self.display()
        return re.sub('>\n<!', '><!', wxr)

class Exporter(object):
    """Handles the wrap process."""

    def __init__(self, options):
        self.output_file = options.output
        self.input_tree = self._get_input()
        self.wxr = Export(self.input_tree)

    def _get_input(self):
        """get a etree from file."""
        try:
            parser = etree.XMLParser(ns_clean=True)
            input_tree = etree.parse(options.input ,parser)

        except(lxml.etree.XMLSyntaxError):
            sys.exit("I am unable to parse the file you supplied. Sorry.")

        return input_tree


    def _process_input_tree(self):
        """Creates the basic document. No tags because there are not in the x2blog backup file.
        """
        self.wxr.create_site_info("the site info", options.url, "description")

        for child in self.input_tree.getroot():
            #self.wxr.create_tag()
            self.wxr.create_category(child[2].text.encode("utf-8"),child[2].text.encode("utf-8"))
            self.wxr.create_item(child)

    def export(self):
        """Generates the WXR."""
        self._process_input_tree()
        output = self.wxr.finalise()

        if self.output_file:
            out = open(self.output_file,'w')
            out.write(output)
            out.close()
        else:
            print output


def parseoptions(args):
    """Parses command line options."""
    parser = OptionParser()
    parser.add_option("-i", "--input", type="string",
                      help="The filename to be process.")
    parser.add_option("-o", "--output", type="string",
                      help="The filename where you want the output stored.")
    parser.add_option("-u", "--url", default="https://lenciel.com",type="string",
                  help="The url you use for your new wordpress blog.")
    return parser.parse_args(args)[0]

if __name__ == '__main__':
    options = parseoptions(sys.argv)
    exporter = Exporter(options)
    exporter.export()

@Lenciel

x2blog搬迁到wordpress

欢迎留言