@Lenciel

x2blog搬迁到wordpress

前面有讲对xml/html的库选择。既然例完假利完器,就要善其事了。其实x2blog提供了备份导出的功能,让整个事情变得非常简单。相比之下,歪酷本来有此功能现在却不知为何关闭了,让用户从歪酷搬走变得难上加难。

首先,x2blog导出的是按照月份为单位的日志,如下图所示:

x2w1

每个xml文件的格式都是相同的,如下图所示:

x2w1

可以从图里面看到,Info节点的title就是blog的名称,而每个Item对应着一篇日志。这里有点诡异的是所有日志的内容都在Abstract节点里面,而不是Content节点里面,估计是x2blog这个日志程序的bug。

有了这些信息,就可以去生成wordpress可以读懂的wxr文件了。

首先做一个类,初始化的时候就生成那些通用的内容,如xml声明部分等:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
def __init__(self, input):
        """Start an empty WXR document: an <rss> root holding one <channel>.

        :param input: the parsed backup tree; it is only stored on the
            instance here.
        """
        self.input_tree = input
        self.site = ''

        # self.xml has to exist before _create_element is called, because
        # that helper asks the document to create the nodes.
        document = xml.dom.minidom.Document()
        self.xml = document

        root = self._create_element('rss', document)
        # Namespace prefixes used by the elements written later on.
        for prefix, uri in (
                ('content', 'http://purl.org/rss/1.0/modules/content/'),
                ('wfw', 'http://wellformedweb.org/CommentAPI/'),
                ('dc', 'http://purl.org/dc/elements/1.1/'),
                ('wp', 'http://wordpress.org/export/1.0/')):
            root.setAttribute('xmlns:' + prefix, uri)
        self.rss = root

        self.channel = self._create_element('channel', root)

然后就需要分别生成wordpress的category(分类)、tag(标签)、item(日志)、comment(评论)这四部分内容。不过x2blog导出的备份文件里并没有tag信息(估计又是一个bug),所以tag先不处理。

日志分类处理

1
2
3
4
5
6
7
8
def create_category(self, nicename, name=""):
        """Declare a category on the channel.

        Nothing is written when ``name`` is the empty string.

        :param nicename: text for ``wp:category_nicename``.
        :param name: display name, stored as CDATA in ``wp:cat_name``.
        """
        if name == "":
            return

        node = self._create_element('wp:category', self.channel)
        self._create_element('wp:category_nicename', node, nicename)
        # Written empty: no parent category is recorded.
        self._create_element('wp:category_parent', node)
        self._cdata(name, self._create_element('wp:cat_name', node))

日志

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def create_item(self, data):
        """Creates an item from the Item element in the tree.

        Child positions read from ``data``:
        [0] title (also used for the link and ``wp:post_name``),
        [1] timestamp in ``YYYY-MM-DD HH:MM:SS`` form, [2] category,
        [4] post body, [7] container of comment elements.

        :param data: one Item element of the backup file.
        """
        title = data[0].text.encode("utf-8")
        # Parse the timestamp once; it feeds both the link path and pubDate.
        posted = datetime.strptime(data[1].text, "%Y-%m-%d %H:%M:%S")
        link = "%s/%s/%s" % (self.site, posted.strftime('%Y/%m/%d'), title)

        item = self._create_element('item', self.channel)
        self._create_element('title', item, title)
        self._create_element('link', item, link)
        # RFC 822 style date; the time part is HH:MM:SS.
        self._create_element('pubDate', item,
                             posted.strftime('%a, %d %b %Y %H:%M:%S +0000'))
        self._create_element('dc:creator', item, 'admin')
        self.item_categories(item, data[2].text.encode("utf-8"))
        # The backup carries no tag information, so no tags are written.
        guid = self._create_element('guid', item, link)
        guid.setAttribute('isPermaLink', 'true')
        self._create_element('description', item)
        element = self._create_element('content:encoded', item)
        self._cdata(data[4].text.encode("utf-8"), element)
        # The same timestamp is written to both date fields.
        self._create_element('wp:post_date', item, data[1].text)
        self._create_element('wp:post_date_gmt', item, data[1].text)
        self._create_element('wp:comment_status', item, 'open')
        self._create_element('wp:ping_status', item, 'open')
        self._create_element('wp:post_name', item, title)
        self._create_element('wp:status', item, 'publish')
        self._create_element('wp:post_parent', item, '0')
        self._create_element('wp:menu_item', item, '0')
        self._create_element('wp:post_type', item, 'post')
        self.item_comments(item, data[7])

评论

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def item_comments(self, item, comment_elems):
        """Creates comments for an item.

        Every child of ``comment_elems`` becomes one ``wp:comment`` under
        ``item``. Child positions read from each comment element:
        [0] comment body, [1] author name, [2] author IP, [3] timestamp.

        :param item: the ``item`` node the comments are attached to.
        :param comment_elems: iterable of comment elements.
        """
        for elem in comment_elems:
            comment = self._create_element('wp:comment', item)
            element = self._create_element('wp:comment_author', comment)
            self._cdata(elem[1].text.encode("utf-8"), element)
            # E-mail and URL are written as fixed placeholder values.
            self._create_element('wp:comment_author_email', comment, 'x@y.com')
            self._create_element('wp:comment_author_url', comment, 'http://xxx')
            self._create_element('wp:comment_author_IP', comment, elem[2].text.encode("utf-8"))
            # The same timestamp is written to both date fields.
            self._create_element('wp:comment_date', comment, elem[3].text.encode("utf-8"))
            self._create_element('wp:comment_date_gmt', comment, elem[3].text.encode("utf-8"))
            self._create_element('wp:comment_content', comment, elem[0].text.encode("utf-8"))
            self._create_element('wp:comment_approved', comment, '1')
            self._create_element('wp:comment_type', comment)
            self._create_element('wp:comment_parent', comment, '0')

实际的文件解析处理过程,都放到了另外一个类里面来使得结构清晰。完整的代码:

(x2wp.py) download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""Script to generate a WordPress eXtended RSS (WXR) file from a x2blog backup file
"""

# 2008 Lenciel <lenciel@gmail.com>.

#import psycopg2
import re
import sys
import xml.dom.minidom
from lxml import etree

from datetime import datetime
from optparse import OptionParser

class Export(object):
    """Handles the details of creating a WordPress eXtended RSS (WXR).

    The document is assembled in memory with ``xml.dom.minidom``:
    ``__init__`` creates the ``<rss>``/``<channel>`` skeleton, the
    ``create_*`` and ``item_*`` methods append to it, and ``finalise``
    serialises the result.
    """

    # Layout of the timestamps read from the backup file.
    _SOURCE_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
    # RFC 822 style layout used for <pubDate> values.
    _PUBDATE_FORMAT = '%a, %d %b %Y %H:%M:%S +0000'

    def __init__(self, input):
        """Creates the basic document.

        :param input: the parsed backup tree; it is only stored on the
            instance, this class never reads it.
        """
        self.input_tree = input
        self.site = ''
        # Nicenames already declared, so each category is written only once.
        self._seen_categories = set()

        self.xml = xml.dom.minidom.Document()
        self.rss = self._create_element('rss', self.xml)
        self.rss.setAttribute('xmlns:content',
                              'http://purl.org/rss/1.0/modules/content/')
        self.rss.setAttribute('xmlns:wfw',
                              'http://wellformedweb.org/CommentAPI/')
        self.rss.setAttribute('xmlns:dc', 'http://purl.org/dc/elements/1.1/')
        self.rss.setAttribute('xmlns:wp', 'http://wordpress.org/export/1.0/')
        self.channel = self._create_element('channel', self.rss)

    @staticmethod
    def _native(value):
        """Return ``value`` as the interpreter's native ``str`` type.

        Byte strings are decoded as UTF-8 on Python 3. On Python 2 a
        ``unicode`` value that is not pure ASCII is encoded as UTF-8, so the
        document keeps holding byte strings there. Any other object goes
        through ``str()``.
        """
        if isinstance(value, str):
            return value
        if isinstance(value, bytes):
            # Only reachable on Python 3, where bytes and str are distinct.
            return value.decode('utf-8')
        try:
            return str(value)
        except UnicodeEncodeError:
            # Python 2 ``unicode`` holding non-ASCII characters.
            return value.encode('utf-8')

    def _node_text(self, node):
        """Return the text of a backup-file node, or '' when it has none."""
        text = node.text
        if text is None:
            return ''
        return self._native(text)

    def _create_element(self, name, parent=None, value=None):
        """Helper function for creating XML elements.

        :param name: The name of the element.
        :param parent: (Optional) The XML node the new element is appended
            to.
        :param value: (Optional) Content for a text node placed inside the
            element; falsy values (``None``, ``''``) add no text node.
        :return: The new element.
        """
        element = self.xml.createElement(self._native(name))

        if parent is not None:
            parent.appendChild(element)

        if value:
            element.appendChild(self.xml.createTextNode(self._native(value)))

        return element

    def display(self):
        """Returns the formatted XML document (empty indent string)."""
        return self.xml.toprettyxml('')

    def create_site_info(self, title, url, description):
        """Populates the site information.

        Also remembers ``url`` in ``self.site``; ``create_item`` uses it as
        the prefix of every post link.
        """
        import time  # local import: the only user of this module

        self._create_element('title', self.channel, title)
        self.site = url
        self._create_element('link', self.channel, url)
        self._create_element('description', self.channel, description)
        # Current UTC time, in the same layout as the item pubDate values.
        self._create_element('pubDate', self.channel,
                             time.strftime(self._PUBDATE_FORMAT, time.gmtime()))
        self._create_element('generator', self.channel, 'x2wp.py')
        # NOTE(review): 'cn' looks like it was meant to be a language code
        # such as 'zh-cn' -- confirm before changing the emitted value.
        self._create_element('language', self.channel, 'cn')

    def create_category(self, nicename, name=""):
        """Creates a Category.

        Nothing is written when ``name`` is empty, or when ``nicename`` has
        already been declared on this document.
        """
        if name == "" or nicename in self._seen_categories:
            return
        self._seen_categories.add(nicename)

        category = self._create_element('wp:category', self.channel)
        self._create_element('wp:category_nicename', category, nicename)
        self._create_element('wp:category_parent', category)
        element = self._create_element('wp:cat_name', category)
        self._cdata(name, element)

    def create_item(self, data):
        """Creates an item from the Item element in the tree.

        Child positions read from ``data``:
        [0] title (also used for the link and ``wp:post_name``),
        [1] timestamp in ``YYYY-MM-DD HH:MM:SS`` form, [2] category,
        [4] post body, [7] container of comment elements.
        Nodes without text are treated as empty strings.

        :raises ValueError: if the timestamp does not match the expected
            layout.
        """
        title = self._node_text(data[0])
        posted = self._node_text(data[1])
        # Parse once; the result feeds both the link path and pubDate.
        when = datetime.strptime(posted, self._SOURCE_DATE_FORMAT)
        link = "%s/%s/%s" % (self.site, when.strftime('%Y/%m/%d'), title)

        item = self._create_element('item', self.channel)
        self._create_element('title', item, title)
        self._create_element('link', item, link)
        self._create_element('pubDate', item,
                             when.strftime(self._PUBDATE_FORMAT))
        self._create_element('dc:creator', item, 'admin')
        self.item_categories(item, self._node_text(data[2]))
        # The backup carries no tag information, so item_tags is not called.
        guid = self._create_element('guid', item, link)
        guid.setAttribute('isPermaLink', 'true')
        self._create_element('description', item)
        element = self._create_element('content:encoded', item)
        self._cdata(self._node_text(data[4]), element)
        # The same timestamp is written to both date fields.
        # NOTE(review): presumably the backup stores local time -- confirm
        # whether the *_gmt field and the +0000 suffix need an offset.
        self._create_element('wp:post_date', item, posted)
        self._create_element('wp:post_date_gmt', item, posted)
        self._create_element('wp:comment_status', item, 'open')
        self._create_element('wp:ping_status', item, 'open')
        self._create_element('wp:post_name', item, title)
        self._create_element('wp:status', item, 'publish')
        self._create_element('wp:post_parent', item, '0')
        self._create_element('wp:menu_item', item, '0')
        self._create_element('wp:post_type', item, 'post')
        self.item_comments(item, data[7])

    def item_categories(self, item, cata):
        """Links an item to a category by adding a CDATA ``category`` child."""
        element = self._create_element('category', item)
        self._cdata(cata, element)

    def create_tag(self, name):
        """Creates a Tag. Not implemented: this method does nothing."""

    def item_tags(self, item, item_id):
        """Links an item to tags. Not implemented: this method does nothing."""

    def item_comments(self, item, comment_elems):
        """Creates comments for an item.

        Every child of ``comment_elems`` becomes one ``wp:comment`` under
        ``item``. Child positions read from each comment element:
        [0] comment body, [1] author name, [2] author IP, [3] timestamp.
        Nodes without text are treated as empty strings.
        """
        for elem in comment_elems:
            comment_date = self._node_text(elem[3])
            comment = self._create_element('wp:comment', item)
            element = self._create_element('wp:comment_author', comment)
            self._cdata(self._node_text(elem[1]), element)
            # E-mail and URL are written as fixed placeholder values.
            self._create_element('wp:comment_author_email', comment, 'x@y.com')
            self._create_element('wp:comment_author_url', comment, 'http://xxx')
            self._create_element('wp:comment_author_IP', comment,
                                 self._node_text(elem[2]))
            self._create_element('wp:comment_date', comment, comment_date)
            self._create_element('wp:comment_date_gmt', comment, comment_date)
            self._create_element('wp:comment_content', comment,
                                 self._node_text(elem[0]))
            self._create_element('wp:comment_approved', comment, '1')
            self._create_element('wp:comment_type', comment)
            self._create_element('wp:comment_parent', comment, '0')

    def _cdata(self, data, parent):
        """Helper function for creating CDATA sections.

        A CDATA section cannot contain the terminator ``]]>``. Text that
        includes it is split across consecutive sections so that no single
        section holds the full terminator.
        """
        pieces = self._native(data).split(']]>')
        last = len(pieces) - 1
        for index, piece in enumerate(pieces):
            if index:
                piece = '>' + piece
            if index != last:
                piece += ']]'
            parent.appendChild(self.xml.createCDATASection(piece))

    def finalise(self):
        """Final cleanup: serialise the document.

        Pretty-printing can leave a line break between an opening tag and
        the CDATA section that follows it; that line break is removed.
        """
        wxr = self.display()
        return re.sub('>\n<!', '><!', wxr)

class Exporter(object):
    """Handles the wrap process: parse the backup, fill an Export, emit WXR."""

    def __init__(self, options):
        """Parses the input file and prepares an empty WXR document.

        :param options: parsed command line options; the ``input``,
            ``output`` and ``url`` attributes are read.
        """
        # Keep the options on the instance so the methods do not depend on a
        # module-level ``options`` variable being defined.
        self.options = options
        self.output_file = options.output
        self.input_tree = self._get_input()
        self.wxr = Export(self.input_tree)

    def _get_input(self):
        """Gets an etree from the input file.

        Exits the program with a message when the file is not well-formed
        XML.
        """
        parser = etree.XMLParser(ns_clean=True)
        try:
            return etree.parse(self.options.input, parser)
        except etree.XMLSyntaxError:
            sys.exit("I am unable to parse the file you supplied. Sorry.")

    def _process_input_tree(self):
        """Creates the basic document.

        Every child of the root element is turned into a category
        declaration plus an item. No tags are created because there are none
        in the x2blog backup file.
        """
        self.wxr.create_site_info("the site info", self.options.url,
                                  "description")

        for child in self.input_tree.getroot():
            category = child[2].text
            if category is None:
                # A node without text: create_category skips empty names.
                category = ""
            elif not isinstance(category, str):
                # Python 2 ``unicode``: hand over UTF-8 bytes.
                category = category.encode("utf-8")
            self.wxr.create_category(category, category)
            self.wxr.create_item(child)

    def export(self):
        """Generates the WXR.

        Writes it to the output file as UTF-8 when one was given, otherwise
        prints it.
        """
        import io  # local import: only the file branch needs it

        self._process_input_tree()
        output = self.wxr.finalise()

        if not self.output_file:
            print(output)
            return

        if isinstance(output, bytes):
            # Python 2 byte string; io.open in text mode wants unicode.
            output = output.decode('utf-8')
        # ``with`` closes the file even when the write fails.
        with io.open(self.output_file, 'w', encoding='utf-8') as out:
            out.write(output)


def parseoptions(args):
    """Parse the command line and return the options object.

    :param args: list of argument strings handed to ``OptionParser``.
    :return: the parsed option values (``input``, ``output``, ``url``);
        positional arguments are discarded.
    """
    option_table = (
        (("-i", "--input"),
         dict(type="string", help="The filename to be process.")),
        (("-o", "--output"),
         dict(type="string",
              help="The filename where you want the output stored.")),
        (("-u", "--url"),
         dict(type="string", default="http://lenciel.com",
              help="The url you use for your new wordpress blog.")),
    )

    cli = OptionParser()
    for flags, settings in option_table:
        cli.add_option(*flags, **settings)

    values, _positional = cli.parse_args(args)
    return values

if __name__ == '__main__':
    # Script entry point: parse the command line, then run the export.
    # Keep the module-level name ``options``; other code in this script may
    # look it up as a global.
    options = parseoptions(sys.argv)
    Exporter(options).export()