|
1 | 1 | #!/usr/bin/env python
|
2 | 2 | # -*- encoding: utf-8 -*-
|
"""
Desc: 如何将原有的《Python Cookbook》3rd edition.pdf文件转换为我自己的cookbook翻译项目格式

1. 首先使用在线PDF文件切割截取出自己想要的pdf文件部分:http://smallpdf.com/split-pdf
2. 安装PDFMiner依赖,然后使用:pdf2txt.py -o pc.txt /home/mango/work/perfect.pdf生成的txt文件
3. 把生成的txt文件放到idea中,去除某些没用的符号,比如'口'字符,全局replace
4. 调用beauty2()函数,去除了页头和页脚的部分
5. 调用convert_cookbook()函数将txt文件转换为cookbook项目所需的格式
"""
|
7 | 12 | import re
|
| 13 | +import os |
| 14 | +from os.path import join |
| 15 | +import logging |
| 16 | + |
# Route INFO-and-above records to a UTF-8 log file that is opened in 'w'
# mode, i.e. truncated every time this module is loaded.
logging.basicConfig(
    handlers=[logging.FileHandler('d:/logs/cookbook.log', 'w', 'utf-8')],
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the conversion helpers below.
_log = logging.getLogger('app.' + __name__)
8 | 22 |
|
9 | 23 |
|
10 | 24 | def beauty(txt_file):
|
@@ -48,13 +62,113 @@ def beauty2(pre_txt, after_txt):
|
48 | 62 | f.writelines(result_lines)
|
49 | 63 |
|
50 | 64 |
|
51 |
| -def generate_chapter(): |
52 |
| - """ |
53 |
| - 解析文本文件,生成最终的待翻译文件 |
54 |
| - """ |
55 |
| - |
# Patterns are matched against the *stripped* line; compiled once, raw strings
# so that ``\d`` is a regex escape rather than an (invalid) string escape.
_CHAPTER_RE = re.compile(r'^CHAPTER (\d+)$')
_SECTION_RE = re.compile(r'^(\d+)\.(\d+)\. ')

# Number of leading lines of an existing target file that are kept when its
# body is replaced.
_HEADER_LINES = 7

# English sub-heading -> translated heading emitted into the target file.
_HEADINGS = {'Solution': '解决方案', 'Discussion': '讨论'}

# Skeleton written to every freshly created per-recipe source file.
_SOURCE_STUB = (
    '#!/usr/bin/env python\n'
    '# -*- encoding: utf-8 -*-\n'
    '"""\n'
    'Topic: \n'
    'Desc : \n'
    '"""\n'
)


def _flush_buffer(write_file, temp_lines):
    """Replace the body of ``write_file`` with ``temp_lines`` and empty the buffer.

    The first ``_HEADER_LINES`` lines already present in ``write_file`` are
    kept in front of the new body. When there is no destination yet
    (``write_file`` is ``None``) the buffered text is dropped with a warning
    instead of crashing on ``open(None)``.
    """
    if not temp_lines:
        return
    if write_file is None:
        _log.warning('no target file yet, dropping %d buffered line(s)',
                     len(temp_lines))
        temp_lines.clear()
        return
    _log.info('write_file={}'.format(write_file))
    with open(write_file, mode='r', encoding='utf-8') as wf:
        # readline() yields '' past EOF, which writelines() emits as nothing.
        header = [wf.readline() for _ in range(_HEADER_LINES)]
    with open(write_file, mode='w', encoding='utf-8') as wf:
        wf.writelines(header + temp_lines)
    temp_lines.clear()


def _find_by_prefix(directory, prefix):
    """Return the path of the first regular file in ``directory`` named ``prefix*``.

    Raises ``StopIteration`` when no such file exists.
    """
    return next(
        join(directory, name)
        for name in os.listdir(directory)
        if os.path.isfile(join(directory, name)) and name.startswith(prefix)
    )


def _heading_block(title):
    """Return the lines that render ``title`` as a sub-heading in the target file."""
    return ['|\n', '\n', '----------\n', title + '\n', '----------\n']


def convert_cookbook(txt_file, base_dir):
    """Split a cleaned-up text dump of the book into the project's per-recipe files.

    The text is scanned line by line:

    * ``CHAPTER <n>`` starts a chapter: ``<base_dir>/cookbook/cNN`` is created
      and the chapter file ``<base_dir>/source/chapters/pNN*`` becomes the
      current destination.
    * ``<c>.<s>. <title>`` followed within three lines by ``Problem`` starts a
      recipe: the file ``<base_dir>/source/cNN/pSS*`` becomes the destination
      and a stub ``<base_dir>/cookbook/cNN/pSS_.py`` is written.
    * ``Solution`` / ``Discussion`` lines are replaced by translated heading
      blocks; every other line is buffered verbatim.
    * ``APPENDIX A`` ends the recipes.

    Whenever the destination changes (and at end of input) the buffered lines
    replace everything after the first seven lines of the previous
    destination file.

    :param txt_file: path of the UTF-8 text file to convert.
    :param base_dir: root of the cookbook project; ``source/chapters`` and
        ``source/cNN`` with their ``pNN*`` files must already exist.
    :raises StopIteration: if a required ``pNN*`` target file is missing.
    """
    chapter = None      # current chapter number
    paper = None        # current recipe (section) number
    write_file = None   # file the buffered lines will be written to
    temp_lines = []     # body collected for ``write_file``
    hit_paper = False   # a "<c>.<s>. " title was seen, waiting for "Problem"
    hit_offset = 0      # lines inspected since that title

    with open(txt_file, mode='r', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            c_match = _CHAPTER_RE.match(text)
            p_match = _SECTION_RE.match(text)

            if c_match:
                old_chapter = chapter
                chapter = int(c_match.group(1))
                if old_chapter and chapter - old_chapter != 1:
                    # NOTE(review): the out-of-sequence number is kept as the
                    # current chapter, exactly as before -- confirm intended.
                    _log.error('章节不连续啊: {}'.format(text))
                    continue
                _log.info('------------------------------------------------------')
                _log.info('---------开始新的一章了,第{}章!-----------'.format(chapter))
                # Whatever was collected belongs to the previous destination.
                _flush_buffer(write_file, temp_lines)
                # Directory for this chapter's example sources.
                os.makedirs(join(base_dir, 'cookbook', 'c{:02d}'.format(chapter)),
                            exist_ok=True)
                write_file = _find_by_prefix(join(base_dir, 'source', 'chapters'),
                                             'p{:02d}'.format(chapter))
                _log.info('找到章节文件:{}'.format(write_file))
            elif p_match:
                # Candidate recipe title; only confirmed once "Problem" follows.
                hit_paper = True
                paper = int(p_match.group(2))
                hit_offset = 0
            elif hit_paper and hit_offset <= 2:
                # Lines inspected here are consumed, not buffered.
                if text == 'Problem':
                    _log.info('开始新的一节了,第{}章,第{}节!'.format(chapter, paper))
                    if write_file is not None and 'chapters' in write_file:
                        # NOTE(review): text collected while the destination
                        # is a chapter file is discarded here, not written --
                        # confirm against the original indentation.
                        temp_lines.clear()
                    else:
                        _flush_buffer(write_file, temp_lines)
                    write_file = _find_by_prefix(
                        join(base_dir, 'source', 'c{:02d}'.format(chapter)),
                        'p{:02d}'.format(paper))
                    _log.info('下次要写的小节文件:{}'.format(write_file))
                    # Create the (empty) example source file for this recipe.
                    c_dir = join(base_dir, 'cookbook', 'c{:02d}'.format(chapter))
                    with open(join(c_dir, 'p{:02d}_.py'.format(paper)), 'w',
                              encoding='utf-8') as pfile:
                        pfile.write(_SOURCE_STUB)
                    hit_paper = False
                hit_offset += 1
                if hit_offset > 2:
                    hit_paper = False
            elif text == 'APPENDIX A':
                _flush_buffer(write_file, temp_lines)
                # Nothing after the appendix marker has a destination; without
                # this the end-of-input flush would overwrite the last recipe.
                write_file = None
            elif text in _HEADINGS:
                temp_lines.extend(_heading_block(_HEADINGS[text]))
            else:
                temp_lines.append(line)

    # Input without an "APPENDIX A" marker (e.g. a partial PDF) would otherwise
    # lose its final recipe.
    _flush_buffer(write_file, temp_lines)
56 | 170 |
|
57 | 171 |
|
if __name__ == '__main__':
    # Convert the header/footer-stripped text dump into the layout of the
    # local python3-cookbook checkout.
    cleaned_txt = r'D:\download\20150430\pc_after.txt'
    project_root = r'D:\work\projects\gitprojects\python3-cookbook'
    convert_cookbook(cleaned_txt, project_root)
0 commit comments