Help -> wiki converter.

2010-11-05 12:40:28 +01:00
parent c85e985342
commit 962aaaced4
3 changed files with 645 additions and 0 deletions
--- a/helpcontent2/to-wiki/convall.py
+++ b/helpcontent2/to-wiki/convall.py
@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+import os, sys
+
+titles = [[]]
+
+def loadallfiles(filename):
+    global titles
+    file=open(filename,"r")
+    for line in file:
+        title = line.split(";")
+        titles.append(title)
+
+loadallfiles("alltitles.csv")
+
+for title in titles:
+    command = ""
+    outfile = ""
+    infile  = ""
+    try:
+        outfile = "wiki/"+title[1].strip()
+        infile  = title[0].strip()
+        command = "python wikiconv2.py "+infile+" > "+outfile
+    except:
+        continue
+
+    try:
+        file = open(outfile,"r")
+    except:
+        print "Processing: "+infile
+        if not os.system(command):
+        #    print "Failed: "+command
+        #    sys.exit(1)
+            pass
+        continue
+    print "Warning: Skipping: "+command
+    file.close()
+    sys.exit(1)
--- a/helpcontent2/to-wiki/getalltitles.py
+++ b/helpcontent2/to-wiki/getalltitles.py
@ -0,0 +1,114 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import xml.parsers.expat
+
+title=""
+parsing=True
+istitle=False
+alltitles=[]
+
+def is_present(title):
+    for i in alltitles:
+        try:
+            if i.strip() == title.strip():
+                return True
+        except:
+            return False
+    return False
+
+def make_unique(title):
+    n=0
+    t = title
+    while is_present(t):
+        n=n+1
+        t = title+"_%d"%(n)
+    return t
+
+replace_text_list = [
+        ["$[officename]","LibreOffice"],
+        ["%PRODUCTNAME","LibreOffice"],
+        ['"+"',"plus"],
+        ['"*"',"star"],
+        ['"-"',"minus"],
+        ['"/"',"slash"],
+        ['"^"',"cap"],
+        [')','_'],
+        ['(','_'],
+        ['\\','_'],
+        ['/','_']
+        ]
+
+modules_list = [
+        "sbasic",
+        "scalc",
+        "schart",
+        "sdraw",
+        "shared",
+        "simpress",
+        "smath",
+        "swriter"
+        ]
+
+def get_module(text):
+    for i in modules_list:
+        if text.find(i) >=0:
+            return i
+    return ""
+
+def replace_text(text):
+    for i in replace_text_list:
+        if text.find(i[0]) >= 0:
+            text = text.replace(i[0],i[1])
+    return text
+
+def start_element(name, attrs):
+    global parsing, istitle
+    if not parsing:
+        return
+    if name == 'title':
+        istitle=True
+
+def end_element(name):
+    global parsing, istitle
+    if not parsing:
+        return
+    if name == 'title':
+        parsign = False
+        istitle = False
+
+def char_data(data):
+    global title, parsing
+    if not istitle:
+        return
+    title = replace_text(data)
+
+def parsexhp(filename):
+    global parsing, title
+    parsing = True
+    file=open(filename,"r")
+    p = xml.parsers.expat.ParserCreate()
+    p.StartElementHandler = start_element
+    p.EndElementHandler = end_element
+    p.CharacterDataHandler = char_data
+    buf = file.read()
+    p.Parse(buf)
+    file.close()
+    title=get_module(filename)+"/"+title
+    title = title.replace(" ","_")
+    title = make_unique(title)
+    alltitles.append(title)
+    print filename+";"+title
+
+if len(sys.argv) < 2:
+    print "getalltitles.py <directory>"
+    print "e.g. getalltitles.py helcontent2/source/text/scalc"
+    sys.exit(1)
+
+pattern = "xhp"
+
+for root, dirs, files in os.walk(sys.argv[1]):
+    for i in files:
+        if i.find(pattern) >= 0:
+            parsexhp(root+"/"+i)
--- a/helpcontent2/to-wiki/wikiconv2.py
+++ b/helpcontent2/to-wiki/wikiconv2.py
@ -0,0 +1,493 @@
+#!/usr/bin/env python
+
+import sys
+import xml.parsers.expat
+
+root="helpcontent2/source/"
+
+titles = [[]]
+
+start_eles = [
+        ["emph","'''"]
+        ]
+
+end_eles = [
+        ["emph","'''"]
+        ]
+
+replace_text_list = [
+        ["$[officename]","LibreOffice"],
+        ["%PRODUCTNAME","LibreOffice"]
+        ]
+
+def get_link_filename(link, name):
+    text = link
+    if link.find("http") >= 0:
+        text = name
+    for title in titles:
+        try:
+            if title[0].find(text) >= 0:
+                return title[1].strip()
+        except:
+            pass
+    return link
+
+def replace_text(text):
+    for i in replace_text_list:
+        if text.find(i[0]) >= 0:
+            text = text.replace(i[0],i[1])
+    return text
+
+def heading(level):
+    str=""
+    for i in range(0,level):
+        str = str+"="
+    return str
+
+class cxml:
+    def __init__(self, sectionid):
+        self.filter_section=sectionid
+        self.objects=[]
+        self.child_parsing=False
+        self.parser_state=True
+        self.depth=1
+        if sectionid != "":
+            self.parser_state=False
+    def start_element(self, name, attrs):
+        if name == 'section':
+            if self.filter_section != "" and attrs['id'] == self.filter_section:
+                self.parser_state=True
+        if name == 'paragraph':
+            if not self.parser_state:
+                para=cparagraph(attrs, self, self.filter_section, self.depth)
+            else:
+                para=cparagraph(attrs, self, '', self.depth)
+            self.depth = para.depth
+            self.child_parsing=True
+            self.objects.append(para)
+        if not self.parser_state:
+            return
+        if name == 'embed':
+            link=attrs['href'].replace('"','')
+            fname=link
+            section=""
+            if link.find("#") >= 0:
+                fname = link[:link.find("#")]
+                section = link[link.find("#")+1:]
+            #print "Parsing: "+fname+" Section: "+section
+            if fname.find("border") >= 0 or \
+               fname.find("background") >= 0:
+                print "Ignoring: "+fname
+            else:
+                self.child_parsing = True
+                child_xml = cxml(section)
+                child_xml.depth = self.depth +1
+                self.objects.append(child_xml)
+                parsexhp(root+fname)
+                self.child_parsing = False
+
+        if name == 'table':
+            child = ctable(attrs, self)
+            self.child_parsing = True
+            self.objects.append(child)
+
+    def end_element(self, name):
+        if not self.parser_state:
+            return
+        if self.filter_section != "" and name == 'section':
+            self.parser_state=False
+    def char_data(self, data):
+        pass
+    def get_curobj(self):
+        if self.child_parsing:
+            #try:
+            #    raise self.objects[len(self.objects)-1]
+            #except cxml:
+            return self.objects[len(self.objects)-1].get_curobj()
+            #except:
+            #    return self.objects[len(self.objects)-1]
+        else:
+            return self
+    def print_all(self):
+        for i in self.objects:
+            i.print_all()
+
+
+class cimage:
+    def __init__(self, attrs, parent):
+        self.src     = attrs['src']
+        try:
+            self.width   = attrs['width']
+            self.height  = attrs['height']
+        except:
+            self.width = self.height = ""
+        self.align   = 'left'
+        self.alt     = False
+        self.alttext = ""
+        self.parent  = parent
+
+    def start_element(self, name, attrs):
+        if name == 'alt':
+            self.alt = True
+
+    def end_element(self, name):
+        if name == 'alt':
+            self.alt = False
+
+        if name == 'image':
+            self.parent.child_parsing = False
+
+    def char_data(self, data):
+        if self.alt:
+            self.alttext = self.alttext + data
+
+    def get_all(self):
+        wikitext = "[[Image:"+self.src+"|border|"+self.align+"|"
+        if len(self.width):
+            wikitext = wikitext + self.width+"x"+self.height+"|"
+        wikitext = wikitext + self.alttext+"]]"
+        return wikitext
+
+    def print_all(self):
+        print self.get_all()
+
+    def get_curobj(self):
+        return self
+
+class ctext:
+    def __init__(self, text):
+        self.wikitext = replace_text(text)
+    def print_all(self):
+        print self.wikitext
+
+class ctabcell:
+    def __init__(self, attrs, parent):
+        # TODO: colspan rowspan
+        self.objects = []
+        self.child_parsing = False
+        self.parent = parent
+        self.header = False
+        pass
+
+    def start_element(self, name, attrs):
+        if name == 'paragraph':
+            if attrs['role'] == 'tablehead':
+                self.header = True
+            para=cparagraph(attrs, self, '', 0)
+            self.child_parsing=True
+            self.objects.append(para)
+        pass
+
+    def end_element(self, name):
+        if name == 'tablecell':
+            self.parent.child_parsing = False
+        pass
+
+    def char_data(self, data):
+        pass
+
+    def print_all(self):
+        for i in self.objects:
+            i.print_all()
+
+    def get_all(self):
+        text = ""
+        for i in self.objects:
+            text = text + i.get_all()
+        return text
+
+    def get_curobj(self):
+        if self.child_parsing:
+            return self.objects[len(self.objects)-1].get_curobj()
+        return self
+
+
+class ctable:
+    def __init__(self, attrs, parent):
+        # TODO/Check: Might Require filtering too...
+        try:
+            self.tableid    = attrs['id']
+        except:
+            self.tableid    = 0
+        self.header         = []
+        self.crow           = []
+        self.content        = [[]]
+        self.child_parsing  = False
+        self.child          = None
+        self.parent         = parent
+
+    def check_add_cell(self):
+        if self.child:
+            self.crow.append(self.child)
+            self.child = None
+
+    def check_add_row(self):
+        if len(self.crow):
+            if self.crow[0].header:
+                self.header = self.crow
+            else:
+                self.content.append(self.crow)
+            self.crow = []
+
+    def start_element(self, name, attrs):
+        if name == 'tablecell':
+            self.check_add_cell()
+            self.child = ctabcell(attrs, self)
+            self.child_parsing = True
+        if name == 'tablerow':
+            self.check_add_cell()
+            self.check_add_row()
+
+    def end_element(self, name):
+        if name == 'table':
+            # the following checks may be unnecessary
+            self.check_add_cell()
+            self.check_add_row()
+            self.parent.child_parsing = False
+
+    def char_data(self, data):
+        pass
+
+    def get_all(self):
+        text = '{| border="1"' # + ' align="left"'
+        if len(self.header):
+            # text = text + "\n|+ caption"
+            text = text +"\n|-"
+            for i in self.header:
+                text = text + '\n! scope="col" | ' + i.get_all()
+        for i in self.content:
+            text = text + "\n|-"
+            for j in i:
+                text = text + "\n| "+j.get_all()
+        text = text + "\n|}"
+        return text
+
+    def print_all(self):
+        print self.get_all().encode('ascii','replace')
+
+    def get_curobj(self):
+        if self.child_parsing:
+            return self.child.get_curobj()
+        return self
+
+class clink:
+    def __init__(self, attrs, parent):
+        self.link       = attrs['href']
+        try:
+            self.lname      = attrs['name']
+        except:
+            self.lname      = self.link[self.link.rfind("/")+1:]
+        # Override lname
+        self.lname = get_link_filename(self.link, self.lname)
+        self.wikitext   = ""
+        self.parent     = parent
+
+    def start_element(self, name, attrs):
+        pass
+
+    def end_element(self, name):
+        if name == "link":
+            self.parent.child_parsing = False
+
+    def char_data(self, data):
+        self.wikitext = self.wikitext + data
+
+    def get_all(self):
+        if self.link.find("http") >= 0:
+            text = "["+self.link+" "+self.wikitext+"]"
+        else:
+            text = "[["+self.lname+"|"+self.wikitext+"]]"
+        if self.parent.heading:
+            text = heading(self.parent.depth) + " " + text + " "+heading(self.parent.depth)
+        text = replace_text(text)
+        return text
+
+    def print_all(self):
+        print self.get_all()
+
+    def get_curobj(self):
+        return self
+
+# Not used yet - cparagraph itself handles it (as of now)
+class cvariable:
+    def __init__(self, sectionid, parent):
+        self.parser_state=True
+        self.wikitext=""
+        if sectionid != "" and attrs['id']==sectionid:
+            self.parser_state=False
+        self.parent = parent
+    def start_element(self, name, attrs):
+        pass
+    def end_element(self,name):
+        if name == 'variable':
+            parent.child_parsing = False
+    def print_all(self):
+        print self.wikitext
+
+class cparagraph:
+    def __init__(self, attrs, parent, sectionid, depth):
+        self.child_parsing = False
+        self.heading=False
+        try:
+            if attrs['role'] == "heading":
+                self.heading = True
+        except:
+            pass
+
+        #try:
+        #    self.level=parent.level+1
+        #except:
+        try:
+            self.level=int(attrs['level'])
+        except:
+            self.level=0
+        self.filter_section=sectionid
+        self.parent = parent
+        self.objects=[]
+        self.parser_state=True
+        if depth > self.level:
+            self.depth = depth
+        else:
+            self.depth = self.level
+        self.wikitext=""
+        if sectionid != "":
+            self.parser_state = False
+    def __del__(self):
+        pass
+    def start_element(self, name, attrs):
+        if name == 'variable':
+            if attrs['id'] == self.filter_section:
+                self.parser_state=True
+        if name == 'paragraph':
+            if not self.parser_state:
+                child = cparagraph(attrs, self, self.filter_section, self.depth+1)
+            else:
+                child = cparagraph(attrs, self, "", self.depth+1)
+            self.child_parsing = True
+            self.objects.append(child)
+
+        if not self.parser_state:
+            return
+        if name == 'embed':
+            # This shouldn't occur
+            print "Warning: Skipped Embedded content!!!"
+        if name == 'image':
+            child = cimage(attrs, self)
+            self.child_parsing = True
+            self.objects.append(child)
+        if name == 'link':
+            child = clink(attrs, self)
+            self.child_parsing = True
+            self.objects.append(child)
+
+
+        global start_eles
+        for n in start_eles:
+            if n[0] == name:
+                #self.wikitext=self.wikitext+n[1]
+                self.objects.append(ctext(n[1]))
+            break
+
+    def end_element(self, name):
+        if name == 'paragraph':
+            self.parent.child_parsing = False
+        if not self.parser_state:
+            return
+        if self.filter_section != "" and name == 'varable':
+            self.parser_state = False
+
+        global end_eles
+        for n in end_eles:
+            if n[0] == name:
+                #self.wikitext=self.wikitext+n[1]
+                self.objects.append(ctext(n[1]))
+            break
+
+    def char_data(self, data):
+        if not self.parser_state or not len(data.strip()):
+            return
+        text=""
+        if self.heading:
+            text = heading(self.depth) + " " + data + " "+heading(self.depth)
+        else:
+            text = data
+        self.objects.append(ctext(text))
+        #self.wikitext = self.wikitext + text
+    def print_all(self):
+        #if self.wikitext != "":
+        #    print self.wikitext
+        text = self.get_all()
+        if len(text):
+            print text.encode('ascii','replace')
+        return
+
+        for i in self.objects:
+            try:
+                raise i
+            except ctext:
+                self.wikitext = self.wikitext + i.wikitext
+            except clink:
+                self.wikitext = self.wikitext + i.get_all() + " "
+            except:
+                if len(self.wikitext):
+                    print self.wikitext
+                    self.wikitext=""
+                i.print_all()
+        if len(self.wikitext):
+            print self.wikitext
+
+    def get_all(self):
+        for i in self.objects:
+            try:
+                raise i
+            except ctext:
+                self.wikitext = self.wikitext + i.wikitext
+            except clink:
+                self.wikitext = self.wikitext + i.get_all() + " "
+            except:
+                if len(self.wikitext):
+                    self.wikitext = self.wikitext + "\n"
+                self.wikitext = self.wikitext + "\n" + i.get_all()
+        return self.wikitext
+
+    def get_curobj(self):
+        if self.child_parsing:
+            return self.objects[len(self.objects)-1].get_curobj()
+        else:
+            return self
+
+head_obj=cxml("")
+def start_element(name, attrs):
+    head_obj.get_curobj().start_element(name,attrs)
+
+def end_element(name):
+    head_obj.get_curobj().end_element(name)
+
+def char_data(data):
+    head_obj.get_curobj().char_data(data)
+
+def parsexhp(filename):
+    file=open(filename,"r")
+    p = xml.parsers.expat.ParserCreate()
+    p.StartElementHandler = start_element
+    p.EndElementHandler = end_element
+    p.CharacterDataHandler = char_data
+    buf = file.read()
+    p.Parse(buf)
+    file.close()
+
+
+def loadallfiles(filename):
+    global titles
+    file=open(filename,"r")
+    for line in file:
+        title = line.split(";")
+        titles.append(title)
+
+if len(sys.argv) < 2:
+    print "wikiconv2.py <inputfile.xph>"
+    sys.exit(1)
+
+loadallfiles("alltitles.csv")
+parsexhp(sys.argv[1])
+head_obj.print_all()