/usr/share/pyshared/pyth/plugins/xhtml/reader.py is in python-pyth 0.5.6-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
"""
Read documents from XHTML.
"""
import BeautifulSoup
from pyth import document
from pyth.format import PythReader
from pyth.plugins.xhtml.css import CSS
class XHTMLReader(PythReader):
    """
    Reader that turns an XHTML source into a pyth document.

    The source is parsed with BeautifulSoup, normalised by `format`,
    then walked recursively by `process_into` to fill a
    `document.Document`.
    """

    @classmethod
    def read(cls, source, css_source=None, encoding="utf-8"):
        """
        Parse `source` and return the resulting pyth document.

        source -- the XHTML markup handed to BeautifulSoup.
        css_source -- optional CSS handed to the CSS helper; when it is
                      falsy an empty CSS object is used instead.
        encoding -- passed to BeautifulSoup as `fromEncoding`.
        """
        # Instantiate through `cls` so that subclasses calling read()
        # get an instance of their own type.
        reader = cls(source, css_source, encoding)
        return reader.go()

    def __init__(self, source, css_source=None, encoding="utf-8"):
        """
        Store the inputs; no parsing happens until `go` is called.
        """
        self.source = source
        self.css_source = css_source
        self.encoding = encoding

    def go(self):
        """
        Parse the stored source and return a filled `document.Document`.

        Sets `self.css`, which the is_bold/is_italic/is_sub/is_super
        helpers consult while the tree is processed.
        """
        soup = BeautifulSoup.BeautifulSoup(
            self.source,
            convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
            fromEncoding=self.encoding,
            smartQuotesTo=None)
        # Make sure the document content doesn't use multi-lines
        soup = self.format(soup)

        doc = document.Document()

        if self.css_source:
            self.css = CSS(self.css_source)
        else:
            self.css = CSS()  # empty css

        self.process_into(soup, doc)
        return doc

    def format(self, soup):
        """
        Format a BeautifulSoup document.

        This transforms the content of the block elements ('p' and
        'li') from multi-line text into a single line, and replaces
        every <br/> tag by a newline character.

        This allows us to avoid having to deal with further text
        rendering once this step has been done.

        Returns a new BeautifulSoup object re-parsed from the
        modified markup.
        """
        # Remove all the newline characters before a closing tag.
        for node in soup.findAll(text=True):
            trimmed = node.rstrip(" ")
            if trimmed.endswith("\n"):
                node.replaceWith(trimmed.rstrip("\n"))

        # Join the block elements lines into a single long line
        for tag in ['p', 'li']:
            for node in soup.findAll(tag):
                lines = [x.strip() for x in unicode(node).splitlines()]
                joined = ' '.join(lines)
                node.replaceWith(BeautifulSoup.BeautifulSoup(joined))
        # Re-parse so the replaced fragments become part of one tree.
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))

        # replace all <br/> tag by newline character
        for node in soup.findAll('br'):
            node.replaceWith("\n")
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))

        return soup

    def is_bold(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        bold: it has a 'b' or 'strong' ancestor, or the CSS says so.
        """
        if node.findParent(['b', 'strong']) is not None:
            return True
        return self.css.is_bold(node)

    def is_italic(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        italic: it has an 'em' or 'i' ancestor, or the CSS says so.
        """
        if node.findParent(['em', 'i']) is not None:
            return True
        return self.css.is_italic(node)

    def is_sub(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        sub: it has a 'sub' ancestor, or the CSS says so.
        """
        if node.findParent(['sub']) is not None:
            return True
        return self.css.is_sub(node)

    def is_super(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        super: it has a 'sup' ancestor, or the CSS says so.
        """
        if node.findParent(['sup']) is not None:
            return True
        return self.css.is_super(node)

    def url(self, node):
        """
        Return the 'href' of the closest enclosing 'a' tag of a
        BeautifulSoup node, or None if there is no such tag (or the
        tag carries no 'href').
        """
        a_node = node.findParent('a')
        if a_node is None:
            return None
        return a_node.get('href')

    def process_text(self, node):
        """
        Return a pyth Text object from a BeautifulSoup node or None if
        the text is empty (only whitespace).

        The emptiness test uses the stripped text, but the Text object
        carries the unstripped string.
        """
        if not node.string.strip():
            return None

        # Set all the properties
        properties = dict()
        if self.is_bold(node):
            properties['bold'] = True
        if self.is_italic(node):
            properties['italic'] = True
        # Look the link up once instead of walking the parents twice.
        href = self.url(node)
        if href:
            properties['url'] = href
        if self.is_sub(node):
            properties['sub'] = True
        if self.is_super(node):
            properties['super'] = True

        content = [node.string]
        return document.Text(properties, content)

    def process_into(self, node, obj):
        """
        Process a BeautifulSoup node and fill its elements into a pyth
        base object.

        Text nodes are converted with `process_text` and appended to
        `obj`. 'p', 'ul' and 'li' tags open a new Paragraph, List or
        ListEntry that receives the tag's children; children of any
        other tag are appended to `obj` itself.
        """
        if isinstance(node, BeautifulSoup.NavigableString):
            text = self.process_text(node)
            if text:
                obj.append(text)
            return

        if node.name == 'p':
            # add a new paragraph into the pyth object
            container = document.Paragraph()
        elif node.name == 'ul':
            # add a new list
            container = document.List()
        elif node.name == 'li':
            # add a new list entry
            container = document.ListEntry()
        else:
            container = None

        if container is not None:
            obj.append(container)
            obj = container

        for child in node:
            self.process_into(child, obj)
|