/usr/share/doc/devhelp/tools/html2xml.py is in devhelp 3.18.1-1ubuntu5.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
import os.path
import sgmllib
import string
import sys
def does_dict_have_keys (dict, keys):
    """Tell whether dict holds exactly the given keys.

    dict -- the dictionary to inspect.
    keys -- sequence of the keys that are expected.

    Returns 1 when every entry of keys is present in dict and dict has
    as many entries as keys, otherwise 0.
    """
    for key in keys:
        # The "in" operator replaces dict.has_key(), which only exists in
        # Python 2; "in" behaves the same on Python 2 and Python 3.
        if key not in dict:
            return 0
    # Same number of entries means dict has no key beyond the expected ones.
    if len (dict) != len (keys):
        return 0
    return 1
def walk (dict, level=0, parent=None):
    """Print the contents tree stored in dict as an outline on stdout.

    dict   -- a node of the tree: its entries are child nodes keyed by
              title, next to the bookkeeping keys 'name', 'order', 'link'.
    level  -- nesting depth of dict; each printed line is prefixed with
              that many '*' characters (no prefix at level 0).
    parent -- the node containing dict; passed along on recursion but not
              otherwise used.

    Children are visited in the order given by dict['order'] when that
    key exists, otherwise in the dictionary's own key order.
    """
    if 'order' in dict:
        names = dict['order']
    else:
        names = list (dict)
    for key in names:
        # These keys describe the node itself, they are not children.
        if key in ('name', 'order', 'link'):
            continue
        child = dict[key]
        # The link is only shown for leaves, i.e. children holding nothing
        # but their 'link'; entries with children of their own show "".
        if does_dict_have_keys (child, ['link']):
            link = child['link']
        else:
            link = ""
        # A single formatted string prints the same text whether print is
        # a statement (Python 2) or a function (Python 3).
        if level:
            print ("%s %s - %s" % ('*' * level, key, link))
        else:
            print ("%s - %s" % (key, link))
        walk (child, level + 1, dict)
class BookParser (sgmllib.SGMLParser):
    """SGML parser that gathers the links of an HTML index page into a tree.

    After a page has been fed, self.dict holds:
      'name'  -- the first text found that is neither blank nor a lone
                 "<" or ">",
      'order' -- the texts of the top level links, in page order,
      <text>  -- one nested dictionary per link text, holding its 'link'
                 (the href seen on the <a> tag) and, once it has children,
                 its own 'order' list and child dictionaries.

    Every <dd> or <ul> element opens a nesting level below the link that
    was recorded last.  Links found below more than MAX_DEPTH open levels
    are ignored.
    """

    # Deepest chain of open <dd>/<ul> elements under which links are kept.
    MAX_DEPTH = 3

    # Link texts used for page navigation; never part of the contents.
    NAVIGATION_LABELS = ("Next Page", "Previous Page", "Home", "Next")

    def __init__ (self):
        sgmllib.SGMLParser.__init__ (self)
        # self.a is not used by this class; it is kept for compatibility
        # but no longer shares one list object with self.parents.
        self.a = []
        # Texts of the links owning the currently open <dd>/<ul> elements.
        self.parents = []
        self.dict = {}
        self.last = self.link = ""
        self.is_a = self.level = 0
        # True until the first text (the book name) has been stored.
        self.first = 1

    def unknown_starttag (self, tag, attrs):
        """Track <a> elements and the nesting opened by <dd> and <ul>."""
        if tag == 'a':
            self.is_a = 1
            for attr in attrs:
                if attr[0] == "href":
                    self.link = attr[1]
                    break
        if tag in ('dd', 'ul'):
            # Whatever is found inside belongs to the last recorded link.
            self.parents.append (self.last)
            self.level = self.level + 1

    def unknown_endtag (self, tag):
        """Leave the <a> element or the nesting level that is being closed."""
        if tag == 'a':
            self.is_a = 0
        # A stray </dd> or </ul> without matching start tag must not make
        # pop() fail on an empty list, so unbalanced end tags are ignored.
        if tag in ('dd', 'ul') and self.parents:
            self.level = self.level - 1
            self.parents.pop()

    def _current_node (self):
        """Return the dictionary new links belong to, or None if unknown.

        None is returned when the nesting is deeper than MAX_DEPTH or when
        one of the open levels does not correspond to a recorded link
        (for instance a list opened before any link was seen).
        """
        if len (self.parents) > self.MAX_DEPTH:
            return None
        node = self.dict
        for name in self.parents:
            if name not in node:
                return None
            node = node[name]
        return node

    def handle_data (self, data):
        """Store the book name or record the text of a link in the tree."""
        data = data.strip()
        if not data or data in (">", "<"):
            return
        if self.first:
            self.dict['name'] = data
            self.first = 0
            return
        if data == self.dict['name'] or data in self.NAVIGATION_LABELS:
            return
        # Only text inside an <a> element describes an entry.
        if not self.is_a:
            return
        node = self._current_node()
        if node is None:
            return
        if data not in node:
            node[data] = {}
            # 'order' remembers the page order of the entries.
            node.setdefault ('order', []).append (data)
        node[data]['link'] = self.link
        self.last = data
def parse_book (url):
    """Parse the HTML index of a book and return its contents tree.

    url -- path of the book directory, or of the index file itself.

    The index is looked up as url/index.html, then url/book1.html, then
    url itself.  Directories are never taken as the index file.

    Returns the dictionary built by BookParser for that file.

    Raises SystemExit carrying the error message when no index is found;
    when uncaught this writes the message to stderr and ends the program
    with a non-zero status, so the message cannot end up in the XML
    written to stdout.
    """
    filename = None
    for candidate in (os.path.join (url, "index.html"),
                      os.path.join (url, "book1.html"),
                      url):
        if os.path.exists (candidate) and not os.path.isdir (candidate):
            filename = candidate
            break
    if filename is None:
        raise SystemExit ("Error; Can't find an index :(")

    # The with statement closes the file even when reading fails.
    with open (filename) as fd:
        markup = fd.read()

    p = BookParser()
    p.feed (markup)
    p.close()
    return p.dict
from xml.sax.saxutils import escape


def _attr (text):
    """Escape text so it can sit inside a double-quoted XML attribute."""
    return escape (text, {'"': '&quot;'})


def book_to_xml_lines (book, link):
    """Return the devhelp XML describing book as a list of output lines.

    book -- contents tree as returned by parse_book: 'name', 'order' and
            one nested dictionary per entry holding 'link' and optionally
            its own 'order' and children.
    link -- value written to the link attribute of the <book> element.

    Three levels of entries are written.  Names and links are escaped so
    that characters such as '&', '<' or '"' cannot break the XML.  A book
    without name or without entries yields an empty title / empty
    <chapters> element.
    """
    sub_open = ' <sub name="%s" link="%s">'
    sub_leaf = ' <sub name="%s" link="%s"/>'

    lines = ['<?xml version="1.0"?>',
             '<book title="%s"\nname=""\nbase=""\nlink="%s">'
             % (_attr (book.get ('name', '')), _attr (link)),
             '<chapters>']
    for chap in book.get ('order', []):
        chapter = book[chap]
        lines.append (sub_open % (_attr (chap), _attr (chapter['link'])))
        for sub in chapter.get ('order', []):
            section = chapter[sub]
            # Only sections that own an 'order' list have children.
            if 'order' in section:
                lines.append (sub_open % (_attr (sub), _attr (section['link'])))
                for sub2 in section['order']:
                    lines.append (sub_leaf % (_attr (sub2),
                                              _attr (section[sub2]['link'])))
                lines.append (' </sub>')
            else:
                lines.append (sub_leaf % (_attr (sub), _attr (section['link'])))
        lines.append (' </sub>')
    lines.extend (['', '</chapters>', '', '</book>'])
    return lines


def main (argv):
    """Convert the book named by argv[1] and print the XML on stdout.

    Raises SystemExit with a usage message when the argument is missing.
    """
    if len (argv) < 2:
        prog = argv[0] if argv else "html2xml.py"
        raise SystemExit ("Usage: %s <book directory or index file>" % prog)
    book = parse_book (argv[1])
    for line in book_to_xml_lines (book, os.path.basename (argv[1])):
        # One string per call prints identically on Python 2 and 3.
        print (line)


if __name__ == "__main__":
    main (sys.argv)
|