Help -> wiki converter.

This commit is contained in:
Muthu Subramanian K
2010-11-05 12:40:28 +01:00
committed by Jan Holesovsky
parent c85e985342
commit 962aaaced4
3 changed files with 645 additions and 0 deletions

38
helpcontent2/to-wiki/convall.py Executable file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env python
import os, sys
# Rows read from the title index, each a list of ";"-separated fields
# (input path first, wiki title second).  The list starts with one empty
# row; the conversion loop skips rows that are too short.
titles = [[]]


def loadallfiles(filename):
    """Append every line of *filename*, split on ";", to the global titles.

    Fields are stored unstripped (the last field keeps its trailing
    newline); callers strip them when they use them.

    filename -- path of the title index to read.
    Raises IOError/OSError if the file cannot be opened.
    """
    # "with" closes the handle even when reading fails part-way through;
    # the original never closed it.
    with open(filename, "r") as index:
        for line in index:
            titles.append(line.split(";"))
# Driver: run wikiconv2.py once for every row of alltitles.csv, writing the
# result to wiki/<title>.
loadallfiles("alltitles.csv")
for title in titles:
    command = ""
    outfile = ""
    infile = ""
    try:
        # title[0] is the input path, title[1] the output name under wiki/.
        outfile = "wiki/"+title[1].strip()
        infile = title[0].strip()
        command = "python wikiconv2.py "+infile+" > "+outfile
    except:
        # Rows with fewer than two fields (such as the initial empty row in
        # titles) cannot be converted; skip them.
        continue
    try:
        # Opening for reading is used as an "output already exists" probe.
        file = open(outfile,"r")
    except:
        # The output could not be opened, so run the converter for it.
        print "Processing: "+infile
        # os.system() returns 0 on success, so this branch is the success
        # path; the failure handling below was left disabled.
        if not os.system(command):
            # print "Failed: "+command
            # sys.exit(1)
            pass
        continue
    # Reached only when the output file could be opened.
    print "Warning: Skipping: "+command
    file.close()
    # NOTE(review): the indentation of this file was lost.  This line is
    # assumed to belong to the loop body (abort on the first output that
    # already exists); it may instead have been at top level - confirm
    # against the original script.
    sys.exit(1)

View File

@ -0,0 +1,114 @@
#!/usr/bin/env python
import sys
import os
import xml.parsers.expat
# Text of the most recently seen <title>; parsexhp() turns it into the
# final wiki title.
title = ""
# True while the expat handlers should still react to elements.
parsing = True
# True between a <title> start tag and its end tag.
istitle = False
# Every title emitted so far; is_present() searches it.
alltitles = []


def is_present(title):
    """Return True if *title* is already listed in alltitles.

    Both sides are compared with leading/trailing whitespace removed.

    title -- candidate title; a value without .strip() never matches.
    Returns a bool.
    """
    try:
        wanted = title.strip()
    except AttributeError:
        # A non-string candidate cannot equal any stored title.
        return False
    for known in alltitles:
        try:
            if known.strip() == wanted:
                return True
        except AttributeError:
            # Skip a malformed entry.  The original bare "except" returned
            # False here, which hid every title stored after it.
            continue
    return False
def make_unique(title):
    """Return *title*, or *title* plus "_<n>" for the smallest n >= 1 that
    is_present() does not report as already taken."""
    candidate = title
    counter = 0
    while is_present(candidate):
        counter += 1
        candidate = title + "_%d" % (counter)
    return candidate
# [search, replacement] pairs applied in order by replace_text().
# Order matters: the quoted operators must be rewritten before the bare
# "/" entry at the end replaces every remaining slash.
replace_text_list = [
    # product-name placeholders
    ["$[officename]", "LibreOffice"],
    ["%PRODUCTNAME", "LibreOffice"],
    # quoted operator symbols
    ['"+"', "plus"],
    ['"*"', "star"],
    ['"-"', "minus"],
    ['"/"', "slash"],
    ['"^"', "cap"],
    # single characters replaced by an underscore
    [')', '_'],
    ['(', '_'],
    ['\\', '_'],
    ['/', '_'],
]
# Module names recognised in a file path, tested in this order.
modules_list = [
    "sbasic",
    "scalc",
    "schart",
    "sdraw",
    "shared",
    "simpress",
    "smath",
    "swriter",
]


def get_module(text):
    """Return the first entry of modules_list that occurs anywhere in
    *text*, or "" when none does."""
    return next((module for module in modules_list if module in text), "")
def replace_text(text):
    """Return *text* with every pair of replace_text_list applied in list
    order (all occurrences of pair[0] become pair[1])."""
    for old, new in replace_text_list:
        if old in text:
            text = text.replace(old, new)
    return text
def start_element(name, attrs):
    """expat start-tag handler: note that a <title> element has begun.

    Does nothing once the global "parsing" flag is false.  *attrs* is
    not used.
    """
    global istitle
    if parsing and name == 'title':
        istitle = True
def end_element(name):
    """expat end-tag handler: stop collecting once </title> is reached.

    Clears both global flags at the end of a <title> element, so later
    elements of the same file are ignored by the handlers.  Does nothing
    when "parsing" is already false.
    """
    global parsing, istitle
    if not parsing:
        return
    if name == 'title':
        # The original assigned to a misspelled local ("parsign"), so the
        # global flag was never cleared and later titles were still seen.
        parsing = False
        istitle = False
def char_data(data):
    """expat character-data handler: while inside <title>, store the
    replace_text()-processed *data* in the global "title".

    NOTE(review): each call overwrites the previous value.  expat may
    deliver one text run in several calls, in which case only the last
    piece is kept - confirm whether titles can be split that way.
    """
    global title
    if istitle:
        title = replace_text(data)
def parsexhp(filename):
    """Extract the title of one .xhp file and print "<filename>;<title>".

    The printed title is "<module>/<title text>" with spaces replaced by
    underscores, made unique with make_unique() and recorded in alltitles.

    filename -- path of the .xhp file to parse.
    Raises IOError/OSError if the file cannot be opened and
    xml.parsers.expat.ExpatError if it is not well-formed.
    """
    global parsing, title
    parsing = True
    # Start from an empty title: a file without <title> must not inherit
    # the already-prefixed title left behind by the previous file.
    title = ""
    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data
    # "with" closes the file even when Parse() raises.
    with open(filename, "r") as xhp:
        p.Parse(xhp.read())
    title = get_module(filename) + "/" + title
    title = title.replace(" ", "_")
    title = make_unique(title)
    alltitles.append(title)
    print(filename + ";" + title)
# Script entry: print one "<file>;<title>" line for every .xhp file found
# below the directory given on the command line.
if len(sys.argv) < 2:
    print("getalltitles.py <directory>")
    # Fixed the misspelled example path ("helcontent2").
    print("e.g. getalltitles.py helpcontent2/source/text/scalc")
    sys.exit(1)
# Match on the extension; the original substring test for "xhp" also
# picked up names such as "a.xhp.orig" or "xhp_notes.txt".
pattern = ".xhp"
for root, dirs, files in os.walk(sys.argv[1]):
    for i in files:
        if i.endswith(pattern):
            parsexhp(root+"/"+i)

493
helpcontent2/to-wiki/wikiconv2.py Executable file
View File

@ -0,0 +1,493 @@
#!/usr/bin/env python
import sys
import xml.parsers.expat
# Prefix prepended to the href of an <embed> before the file is parsed.
root = "helpcontent2/source/"

# Rows of the title index, filled by loadallfiles(); starts with one
# empty row.
titles = [[]]

# [element name, wiki markup] emitted when the element starts.
start_eles = [
    ["emph", "'''"],
]

# [element name, wiki markup] emitted when the element ends.
end_eles = [
    ["emph", "'''"],
]

# [search, replacement] pairs applied by replace_text().
replace_text_list = [
    ["$[officename]", "LibreOffice"],
    ["%PRODUCTNAME", "LibreOffice"],
]
def get_link_filename(link, name):
    """Return the wiki title recorded for a link target.

    The first row of the global titles whose path field contains the
    search text wins; its title field is returned stripped.  The search
    text is *link*, or *name* when *link* contains "http".

    link -- href of the link.
    name -- fallback text searched for when link contains "http".
    Returns the matching title, or *link* unchanged when no row matches.

    NOTE(review): an href carrying a "#fragment" is searched for as-is,
    so it will not match a plain file path - confirm whether fragments
    should be removed before the lookup.
    """
    wanted = name if "http" in link else link
    for row in titles:
        # Rows need both a path and a title.  The explicit length check
        # replaces a bare "except: pass" that swallowed every error.
        if len(row) < 2:
            continue
        if wanted in row[0]:
            return row[1].strip()
    return link
def replace_text(text):
    """Apply each [search, replacement] pair of replace_text_list to
    *text*, in list order, and return the result."""
    for pair in replace_text_list:
        needle = pair[0]
        if needle in text:
            text = text.replace(needle, pair[1])
    return text
def heading(level):
    """Return a string of *level* "=" characters (wiki heading markup).

    level -- number of "=" characters; 0 or a negative value gives "".
    """
    # String repetition replaces the manual loop, which also shadowed the
    # builtin "str".
    return "=" * level
class cxml:
    """Top-level container for the objects parsed from one XHP file.

    When built with a non-empty section id the container stays inactive
    until a <section> with that id starts, and goes inactive again at the
    next </section>.
    """

    def __init__(self, sectionid):
        """sectionid -- id of the section to keep, or "" to keep all."""
        self.filter_section = sectionid
        self.objects = []
        # True while events must be routed to the last object in objects.
        self.child_parsing = False
        # Active from the start only when no section filter is set.
        self.parser_state = (sectionid == "")
        self.depth = 1

    def start_element(self, name, attrs):
        """Handle a start tag: create paragraph, embed or table children."""
        if (name == 'section' and self.filter_section != ""
                and attrs['id'] == self.filter_section):
            self.parser_state = True
        if name == 'paragraph':
            # Paragraphs are created even while inactive; they then carry
            # the section filter themselves.
            wanted = '' if self.parser_state else self.filter_section
            para = cparagraph(attrs, self, wanted, self.depth)
            self.depth = para.depth
            self.child_parsing = True
            self.objects.append(para)
        if not self.parser_state:
            return
        if name == 'embed':
            link = attrs['href'].replace('"', '')
            # "file#section" -> file name and optional section id.
            fname, _sep, section = link.partition("#")
            if "border" in fname or "background" in fname:
                print("Ignoring: "+fname)
            else:
                # Parse the embedded file into a nested container; events
                # reach it through get_curobj() while child_parsing is set.
                self.child_parsing = True
                child_xml = cxml(section)
                child_xml.depth = self.depth + 1
                self.objects.append(child_xml)
                parsexhp(root+fname)
                self.child_parsing = False
        elif name == 'table':
            child = ctable(attrs, self)
            self.child_parsing = True
            self.objects.append(child)

    def end_element(self, name):
        """Handle an end tag: leave the filtered section at </section>."""
        if self.parser_state and self.filter_section != "" and name == 'section':
            self.parser_state = False

    def char_data(self, data):
        """Character data directly inside the container is ignored."""
        pass

    def get_curobj(self):
        """Return the innermost object currently receiving events."""
        if not self.child_parsing:
            return self
        return self.objects[-1].get_curobj()

    def print_all(self):
        """Print every collected object in document order."""
        for child in self.objects:
            child.print_all()
class cimage:
    """An <image> element, rendered as a wiki [[Image:...]] link."""

    def __init__(self, attrs, parent):
        """attrs  -- element attributes; 'src' is required, 'width' and
                  'height' are used only when both are present.
        parent -- object whose child_parsing flag is cleared at </image>.
        Raises KeyError if 'src' is missing.
        """
        self.src = attrs['src']
        try:
            self.width = attrs['width']
            self.height = attrs['height']
        except KeyError:
            # A size is only emitted when both dimensions are known.
            # (Narrowed from a bare "except".)
            self.width = self.height = ""
        self.align = 'left'
        # True while inside the <alt> child element.
        self.alt = False
        self.alttext = ""
        self.parent = parent

    def start_element(self, name, attrs):
        """Start collecting alternative text at <alt>."""
        if name == 'alt':
            self.alt = True

    def end_element(self, name):
        """Stop collecting at </alt>; hand control back at </image>."""
        if name == 'alt':
            self.alt = False
        if name == 'image':
            self.parent.child_parsing = False

    def char_data(self, data):
        """Accumulate character data found inside <alt>."""
        if self.alt:
            self.alttext = self.alttext + data

    def get_all(self):
        """Return the wiki markup for this image."""
        wikitext = "[[Image:" + self.src + "|border|" + self.align + "|"
        if self.width:
            wikitext = wikitext + self.width + "x" + self.height + "|"
        return wikitext + self.alttext + "]]"

    def print_all(self):
        """Print the wiki markup for this image."""
        print(self.get_all())

    def get_curobj(self):
        """An image has no children, so it is always the current object."""
        return self
class ctext:
    """A run of plain wiki text.

    Kept as a classic class: cparagraph tells object kinds apart by
    raising the instance and catching it by class.
    """

    def __init__(self, text):
        """Store *text* after passing it through replace_text()."""
        converted = replace_text(text)
        self.wikitext = converted

    def print_all(self):
        """Print the stored text."""
        print(self.wikitext)
class ctabcell:
    """One <tablecell>: a list of paragraphs plus a header flag."""

    def __init__(self, attrs, parent):
        """attrs  -- element attributes (currently unused).
        parent -- table whose child_parsing flag is cleared at </tablecell>.
        """
        # TODO: colspan rowspan
        self.objects = []
        # True while events must be routed to the last paragraph.
        self.child_parsing = False
        self.parent = parent
        # Set once a paragraph with role "tablehead" is seen in the cell.
        self.header = False

    def start_element(self, name, attrs):
        """Create a paragraph child for every <paragraph> in the cell."""
        if name == 'paragraph':
            # .get() instead of attrs['role']: a paragraph without a role
            # attribute must not abort the conversion with a KeyError
            # (cparagraph tolerates a missing role as well).
            if attrs.get('role') == 'tablehead':
                self.header = True
            para = cparagraph(attrs, self, '', 0)
            self.child_parsing = True
            self.objects.append(para)

    def end_element(self, name):
        """Hand control back to the table at </tablecell>."""
        if name == 'tablecell':
            self.parent.child_parsing = False

    def char_data(self, data):
        """Character data directly inside the cell is ignored."""
        pass

    def print_all(self):
        """Print every paragraph of the cell."""
        for child in self.objects:
            child.print_all()

    def get_all(self):
        """Return the concatenated wiki text of all paragraphs."""
        return "".join(child.get_all() for child in self.objects)

    def get_curobj(self):
        """Return the innermost object currently receiving events."""
        if self.child_parsing:
            return self.objects[-1].get_curobj()
        return self
class ctable:
    """A <table>, rendered as wiki table markup ({| ... |})."""

    def __init__(self, attrs, parent):
        """attrs  -- element attributes; 'id' is optional.
        parent -- object whose child_parsing flag is cleared at </table>.
        """
        # TODO/Check: Might Require filtering too...
        # 0 stands for "no id" (replaces a bare "except" around the lookup).
        self.tableid = attrs.get('id', 0)
        self.header = []        # cells of the header row, if any
        self.crow = []          # cells of the row being collected
        # NOTE(review): the initial empty row makes get_all() emit one
        # extra "|-" line; kept so the generated text stays unchanged.
        self.content = [[]]
        self.child_parsing = False
        self.child = None       # cell currently being parsed
        self.parent = parent

    def check_add_cell(self):
        """Move the pending cell, if any, into the current row."""
        if self.child:
            self.crow.append(self.child)
            self.child = None

    def check_add_row(self):
        """File the current row as header or content and start a new one."""
        if self.crow:
            # The first cell decides whether the whole row is a header.
            if self.crow[0].header:
                self.header = self.crow
            else:
                self.content.append(self.crow)
            self.crow = []

    def start_element(self, name, attrs):
        """Open a new cell at <tablecell>, close the row at <tablerow>."""
        if name == 'tablecell':
            self.check_add_cell()
            self.child = ctabcell(attrs, self)
            self.child_parsing = True
        if name == 'tablerow':
            self.check_add_cell()
            self.check_add_row()

    def end_element(self, name):
        """Flush pending cell/row and hand control back at </table>."""
        if name == 'table':
            # the following checks may be unnecessary
            self.check_add_cell()
            self.check_add_row()
            self.parent.child_parsing = False

    def char_data(self, data):
        """Character data directly inside the table is ignored."""
        pass

    def get_all(self):
        """Return the wiki markup of the whole table."""
        # Pieces are collected and joined once instead of growing one
        # string by repeated concatenation.
        parts = ['{| border="1"']
        if self.header:
            parts.append("\n|-")
            for cell in self.header:
                parts.append('\n! scope="col" | ' + cell.get_all())
        for row in self.content:
            parts.append("\n|-")
            for cell in row:
                parts.append("\n| " + cell.get_all())
        parts.append("\n|}")
        return "".join(parts)

    def print_all(self):
        """Print the table markup, replacing non-ASCII characters."""
        print(self.get_all().encode('ascii', 'replace'))

    def get_curobj(self):
        """Return the innermost object currently receiving events."""
        if self.child_parsing:
            return self.child.get_curobj()
        return self
class clink:
    """A <link> element, rendered as an external or internal wiki link.

    Kept as a classic class: cparagraph tells object kinds apart by
    raising the instance and catching it by class.
    """

    def __init__(self, attrs, parent):
        """attrs  -- element attributes; 'href' is required, 'name' is
                  optional.
        parent -- paragraph owning the link; its heading/depth are read
                  by get_all() and its child_parsing flag is cleared at
                  </link>.
        Raises KeyError if 'href' is missing.
        """
        self.link = attrs['href']
        try:
            self.lname = attrs['name']
        except KeyError:
            # No name given: fall back to the last path component of the
            # href.  (Narrowed from a bare "except".)
            self.lname = self.link[self.link.rfind("/")+1:]
        # Override lname
        self.lname = get_link_filename(self.link, self.lname)
        self.wikitext = ""
        self.parent = parent

    def start_element(self, name, attrs):
        """Elements nested inside a link are ignored."""
        pass

    def end_element(self, name):
        """Hand control back to the paragraph at </link>."""
        if name == "link":
            self.parent.child_parsing = False

    def char_data(self, data):
        """Accumulate the visible link text."""
        self.wikitext = self.wikitext + data

    def get_all(self):
        """Return the wiki markup for the link.

        An href containing "http" becomes "[url text]", anything else
        "[[target|text]]".  Inside a heading paragraph the link is
        wrapped in heading markers of the paragraph's depth.
        """
        if "http" in self.link:
            text = "[" + self.link + " " + self.wikitext + "]"
        else:
            text = "[[" + self.lname + "|" + self.wikitext + "]]"
        if self.parent.heading:
            marker = heading(self.parent.depth)
            text = marker + " " + text + " " + marker
        return replace_text(text)

    def print_all(self):
        """Print the wiki markup for the link."""
        print(self.get_all())

    def get_curobj(self):
        """A link has no children, so it is always the current object."""
        return self
# Not used yet - cparagraph itself handles it (as of now)
class cvariable:
    """Placeholder for a <variable> element (no caller in this file)."""

    def __init__(self, sectionid, parent, attrs=None):
        """sectionid -- id of the wanted variable, or "" for no filter.
        parent    -- object whose child_parsing flag is cleared at
                     </variable>.
        attrs     -- optional element attributes.  The original body read
                     an undefined name "attrs" and raised NameError for
                     any non-empty sectionid; it is now an optional
                     argument.
        """
        self.parser_state = True
        self.wikitext = ""
        # NOTE(review): switching the state *off* when the id matches is
        # kept from the original but looks inverted - confirm before use.
        if (sectionid != "" and attrs is not None
                and attrs.get('id') == sectionid):
            self.parser_state = False
        self.parent = parent

    def start_element(self, name, attrs):
        """Nested elements are ignored."""
        pass

    def end_element(self, name):
        """Hand control back to the parent at </variable>."""
        if name == 'variable':
            # Was a bare "parent" (NameError); the parent is an attribute.
            self.parent.child_parsing = False

    def print_all(self):
        """Print the collected text."""
        print(self.wikitext)
class cparagraph:
    """A <paragraph> element and the inline objects collected inside it.

    Children are ctext, clink, cimage and nested cparagraph objects.
    With a non-empty section id the paragraph collects only while inside
    the <variable> element carrying that id.
    """

    def __init__(self, attrs, parent, sectionid, depth):
        """attrs     -- element attributes; 'role' and 'level' are
                     optional.
        parent    -- object whose child_parsing flag is cleared at
                     </paragraph>.
        sectionid -- id of the <variable> to keep, or "" to keep all.
        depth     -- heading depth inherited from the parent.
        """
        self.child_parsing = False
        # Text of a paragraph with role "heading" gets heading markup.
        self.heading = attrs.get('role') == "heading"
        try:
            self.level = int(attrs['level'])
        except (KeyError, ValueError):
            # Missing or non-numeric level counts as 0.
            self.level = 0
        self.filter_section = sectionid
        self.parent = parent
        self.objects = []
        # Active from the start only when no filter is set.
        self.parser_state = (sectionid == "")
        # The effective depth is the larger of inherited depth and level.
        self.depth = max(depth, self.level)
        self.wikitext = ""
        # The original no-op __del__ was dropped: these objects form
        # parent<->child reference cycles, and a finalizer keeps such
        # cycles from being collected on Python 2.

    def start_element(self, name, attrs):
        """Handle a start tag inside the paragraph."""
        if name == 'variable':
            # .get(): a variable without an id simply never matches.
            if attrs.get('id') == self.filter_section:
                self.parser_state = True
        if name == 'paragraph':
            # Nested paragraphs inherit the filter while it is unmet.
            wanted = "" if self.parser_state else self.filter_section
            child = cparagraph(attrs, self, wanted, self.depth+1)
            self.child_parsing = True
            self.objects.append(child)
        if not self.parser_state:
            return
        if name == 'embed':
            # This shouldn't occur
            print("Warning: Skipped Embedded content!!!")
        if name == 'image':
            child = cimage(attrs, self)
            self.child_parsing = True
            self.objects.append(child)
        if name == 'link':
            child = clink(attrs, self)
            self.child_parsing = True
            self.objects.append(child)
        # Inline markup elements are turned into literal wiki text.
        for tag, markup in start_eles:
            if tag == name:
                self.objects.append(ctext(markup))
                break

    def end_element(self, name):
        """Handle an end tag inside the paragraph."""
        if name == 'paragraph':
            self.parent.child_parsing = False
        if not self.parser_state:
            return
        # Stop collecting when the filtered variable ends.  The original
        # compared against the misspelling 'varable', so collection never
        # stopped and the rest of the paragraph leaked into the output.
        if self.filter_section != "" and name == 'variable':
            self.parser_state = False
        for tag, markup in end_eles:
            if tag == name:
                self.objects.append(ctext(markup))
                break

    def char_data(self, data):
        """Store non-blank character data as a text child."""
        if not self.parser_state or not data.strip():
            return
        if self.heading:
            marker = heading(self.depth)
            data = marker + " " + data + " " + marker
        self.objects.append(ctext(data))

    def print_all(self):
        """Print the paragraph's wiki text, replacing non-ASCII characters.

        Nothing is printed for an empty paragraph.  The original had a
        second loop after this point; it only ran when get_all() returned
        "", i.e. when no child contributed any text, so it could never
        print anything and was removed.
        """
        text = self.get_all()
        if text:
            print(text.encode('ascii', 'replace'))

    def get_all(self):
        """Return the wiki text of the paragraph and all its children.

        Text and links are joined inline (a link is followed by a space);
        any other child starts on a new line.  The result is also stored
        in self.wikitext.  Unlike the original, calling this twice does
        not duplicate the text.
        """
        text = ""
        for obj in self.objects:
            # isinstance() replaces "raise obj / except ctext", a trick
            # that only works for classic classes on Python 2.
            if isinstance(obj, ctext):
                text = text + obj.wikitext
            elif isinstance(obj, clink):
                text = text + obj.get_all() + " "
            else:
                if text:
                    text = text + "\n"
                text = text + "\n" + obj.get_all()
        self.wikitext = text
        return text

    def get_curobj(self):
        """Return the innermost object currently receiving events."""
        if self.child_parsing:
            return self.objects[-1].get_curobj()
        return self
# Root container of the parsed document (no section filter); the
# module-level expat handlers forward every event to head_obj.get_curobj().
head_obj=cxml("")
def start_element(name, attrs):
    """expat start-tag callback: forward the event to the object that is
    currently being parsed."""
    current = head_obj.get_curobj()
    current.start_element(name, attrs)
def end_element(name):
    """expat end-tag callback: forward the event to the object that is
    currently being parsed."""
    current = head_obj.get_curobj()
    current.end_element(name)
def char_data(data):
    """expat character-data callback: forward the text to the object that
    is currently being parsed."""
    current = head_obj.get_curobj()
    current.char_data(data)
def parsexhp(filename):
    """Parse one XHP file, feeding its events to the module-level handlers.

    A fresh expat parser is created per call, so the function can be
    re-entered for embedded files while an outer parse is still running.

    filename -- path of the file to parse.
    Raises IOError/OSError if the file cannot be opened and
    xml.parsers.expat.ExpatError if it is not well-formed.
    """
    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data
    # "with" closes the file even when Parse() raises; the original only
    # closed it after a successful parse.
    with open(filename, "r") as xhp:
        p.Parse(xhp.read())
def loadallfiles(filename):
    """Append every line of *filename*, split on ";", to the global titles.

    Each line becomes a list of its fields; fields are stored unstripped
    (the last one keeps its trailing newline).

    filename -- path of the title index to read.
    Raises IOError/OSError if the file cannot be opened.
    """
    # "with" closes the handle, which the original never did.
    with open(filename, "r") as index:
        for line in index:
            titles.append(line.split(";"))
# Script entry: convert the file named on the command line and print the
# resulting wiki text.
if len(sys.argv) < 2:
    # Fixed the extension in the usage text (".xph" -> ".xhp").
    print("wikiconv2.py <inputfile.xhp>")
    sys.exit(1)
# The title index must be loaded before parsing, because links look up
# their target titles while they are being created.
loadallfiles("alltitles.csv")
parsexhp(sys.argv[1])
head_obj.print_all()