From 50d03b1815bc57b989fc6c7baf56503392acc658 Mon Sep 17 00:00:00 2001 From: jlugjb Date: Fri, 28 Apr 2017 16:51:28 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E4=B8=AD=E9=97=B4=E6=9C=89?= =?UTF-8?q?=E5=A4=9A=E4=B8=AA=E7=A9=BA=E6=A0=BC=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit print __is_chapter_title(u"正文 第647章 战战和和") 使用状态机,代码更容易读和调试 --- txt2mobi/txt2html.py | 49 ++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/txt2mobi/txt2html.py b/txt2mobi/txt2html.py index d548e0a..196899a 100644 --- a/txt2mobi/txt2html.py +++ b/txt2mobi/txt2html.py @@ -151,29 +151,34 @@ def __is_chapter_title(self, line): if re.match(self.title_filter, strip_line): return True else: - if line.strip().startswith(u'第'): - if 3 < len(line.strip()) < 30 and u"第" in line and u"章" in line: - return True - if line.strip().startswith(u'第'): - if 3 < len(line.strip()) < 30 and u"第" in line and u"张" in line: - return True - if line.strip().startswith(u'正文 第'): - if 3 < len(line.strip()) < 30 and u"第" in line and u"章" in line: - return True - line = line.replace(u".", u".").replace(u":", u".") - if line.split('.')[0].isdigit(): - if 3 < len(line.strip()) < 20: - return True - if len(line) < 20 and (line.strip()[:3].isdigit() or line.strip()[:4].isdigit()): - return True - if len(line) < 40 and u"第" in line and u"卷" in line: - if line[line.index(u"第") + 1: line.index(u"卷")] in [u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九", u"十"]: - return True - if line.strip().startswith(u'[第'): - if 3 < len(line.strip()) < 30 and u"第" in line and u"章" in line: - return True + flag = "content" + for ch in line: + if flag == "content" and ch == u"第": + flag = "starttitle" + continue + + if flag == "starttitle": + if (ch == " " or ch == u" "): + flag = "starttitle" + continue + elif re.match(u"([0-9一二三四五六七八九十]+)", ch): + flag = "number" + continue + else: + flag = "content" - return False + if flag == "number" : + if re.match(u"([0-9一二三四五六七八九十]+)", ch): + flag = "number" + continue + elif (ch == " " or ch == u" "): + flag = "number" + continue + if (ch == u"张" or ch == u"章" or ch == u"卷"): + return True + else: + flag = "content" + return False def process_lines(self, lines): """