Python练习1-文档格式化成html-CFANZ编程社区

文档格式化成HTML

THML，并没有处理所有thml规则，只是处理了一部分，功能不重要，重要的是复习熟悉下Python对文档的处理细节。毕竟Python大多数给我的印象都是处理文档。代码里有很多逻辑可能不严谨，这里再次强调只是为了复习字符串以及文档操作。

vs2015编写的，返现当时中文注释导致编码错误运行失败。一共四个文档：

Python练习1-文档格式化成html_html

入口文档是markup.py。

参数 python markup.py < xxx.txt > xxx.html

执行效果和代码如下：

Python练习1-文档格式化成html_ide_02

Util.py

def lines(file):
    for line in file:yield line
    yield '\n'
 
def blocks(file):
    block = []
    for line in lines(file):
        if line.strip():   #'\n','\r','\t',' '
            block.append(line);
        elif block:
            yield ''.join(block).strip()
            block = []

Handlers.py

class Handler:
    """
    处理从Parser调用的方法的对象。
 
    这个解析器会在每个块的开始部分调用start()和end()方法，使用合适的
    块名作为参数，sub()方法会用于正则表达式替换中。当使用了'emphasis'
    这样的名字调用时，它会返回合适的替换函数。
    """
    def callback(self ,prefix ,name ,*args):
        method = getattr(self ,prefix + name ,None)
        if callable(method) : return method(*agrs)
    def start(self ,name):
        self.callback('start_',name);
    def end(self ,name):
        self.callback('end_',name);
    def sub(self ,name):
        def substitution(match):
            result = self.callback('sub_' ,name ,match)
            if result is None: match.group(0)
            return result
        return substitution
 
class HTMLRenderer(Handler):
    """
    用于生成HTML的具体处理程序
 
    THMLRenderer内容的方法都是可以通过超类处理程序的start()、
    end()和sub()方法来访问。他们实现了用于HTRML文档的基本标签。
    """
    def start_document(self):
        print('<html><head><title>...</title></head><body>')
    def end_document(self):
        print('</body></html>')
    def start_paragraph(self):
        print('<p>')
    def end_paragraph(self):
        print('</p>')
    def start_heading(self):
        print('<h2>')
    def end_heading(self):
        print('</h2>')
    def start_list(self):
        print('<ul>')
    def end_list(self):
        print('</ul>')
    def start_listitem(self):
        print('<li>')
    def end_listitem(self):
        print('</li>')
    def start_title(self):
        print('<title>')
    def end_title(self):
        print('</title>')
    def sub_emphasis(self ,match):
        return ('<em>%s</em>' % match.group(1))
    def sub_url(self ,match):
        return ('<a href = "%s">%s</a>' % (match.group(1),match.group(1)))
    def sub_mail(self ,match):
        return ('<a href="mailto:%s">%s</a>' % (match.group(1),match.group(1)))
    def feed(self ,data):
        print(data)

Rules.py

class Rule:
    """
    所有规则的基类。
    """
    def action(self ,block ,handler):
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True
 
class HeadingRule(Rule):
    """
    标题占一行，最多70个字符，并且不以冒号结尾。
    """
    type = 'heading'
    def condition(self ,block):
        return not '\n' in block and len(block) <= 70 and not block[-1] == ':'
 
class TitleRule(HeadingRule):
    """
    题目是文档的第一个块，但前提它是大标题。
    """
    type = 'title'
    first = True
    def condition(self, block):
        if not self.first:return False
        self.first = HeadingRule.condition(self ,block)
 
class ListItemRule(Rule):
    """
    列表项是以连字符开始的段落。作为格式化的一部分，要移除连字符。
    """
    type = 'listitem'
    def condition(self ,block):
        return block[0] == '-'
    def action(self, block, handler):
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True
 
class ListRule(ListItemRule):
    """
    列表从不是列表项的块和岁以后的列表项之间。在最后一个连续的列表项之后结束。
    """
    type = 'list'
    inside = False
    def condition(self ,block):
        return True
    def action(self, block, handler):
        if not self.inside and ListItemRule.condition(self ,block):
            handler.start(self.type)
            self.inside = True
        elif self.inside and not ListItemRule.condition(self ,block):
            handler.end(self.type)
            self.inside = False
        return False
 
class ParagraphRule(Rule):
    """
    段落只是其他规则并没有覆盖到的块
    """
    type = 'paragraph'
    def condition(self ,block):
        return True

Markup.py

import sys,re
from handlers import *
from util import *
from rules import *
 
class Parser:
   #"""
   #语法分析器读取文本文件、应用规则并且控制处理程序
   # """
    def __init__(self ,handler):
        self.handler = handler
        self.rules = []
        self.filters = []
    def addRule(self ,rule):
        self.rules.append(rule)
    def addFilter(self ,pattern ,name):
        def filter(bolck ,handler):
            return re.sub(pattern ,handler.sub(name),bolck)
        self.filters.append(filter)
    def parse(self ,file):
        self.handler.start('document')
        for block in blocks(file):
            for filter in self.filters:
                block = filter(block ,self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    last = rule.action(block ,self.handler)
                    if last : break
        self.handler.end('document')
 
class BasicTextParser(Parser):
    """
    在构造函数中添加规则和过滤器的具体语法分析器
    """
    def __init__(self, handler):
        Parser.__init__(self ,handler)
        self.addRule(ListRule)
        self.addRule(ListItemRule)
        self.addRule(TitleRule)
        self.addRule(HeadingRule)
        self.addRule(ParagraphRule)
 
        self.addFilter(r'\*(.+?)\*' ,'emphasis')
        self.addFilter(r'(http://[\.a-zA-Z/]+)' ,'url')
        self.addFilter(r'([\.a-zA-Z])+@[\.a-zA-Z]+[a-zA-Z]+)' ,'mail')
 
handler = HTMLRenderer()
parser = BasicTextParser(handler)
 
parser.parse(sys.stdin)