diff --git a/.gitignore b/.gitignore index 6d3932f..69525af 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.vscode +outputs/ *.pyc @@ -168,4 +169,4 @@ long1.jpg *.pdiparams.info *.pdmodel -.DS_Store \ No newline at end of file +.DS_Store diff --git a/README.md b/README.md index 5c01438..cee729c 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,38 @@ -# Work In Progress -整体功能还没开发完哈!欢迎加入一起搞 +
+
+

📃 Rapid Doc

+
-## Rapid Layout Recover + + + + + + +SemVer2.0 + -该项目主要针对文档类图像做版面还原,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。 +
-## 输入和输出 +> +> ## 🚀 Work In Progress +> +> 整体功能还没开发完哈!欢迎加入一起搞 -- 输入:文档类图像 -- 输出:TXT或Word +## 📝 简介 + +该项目主要针对文档类图像做内容提取,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。后续计划支持输入PDF/图像,输出对应json格式、Txt格式、Word格式和Markdown格式。 + +## 🛠️ 整体框架 + +以下为整体框架依赖包,均为RapidAI出品。 -## 整体框架 +- [rapid_orientation](https://github.com/RapidAI/RapidStructure/blob/main/docs/README_Orientation.md) +- [rapid_layout](https://github.com/RapidAI/RapidLayout) +- [rapid_table](https://github.com/RapidAI/RapidTable) / [TableStructureRec](https://github.com/RapidAI/TableStructureRec) +- [rapid_latex_ocr](https://github.com/RapidAI/RapidLatexOCR) +- [rapidocr_onnxruntime](https://github.com/RapidAI/RapidOCR) +- [rapidocr_layout_recover](https://github.com/RapidAI/RapidDoc) ```mermaid flowchart TD @@ -19,12 +41,40 @@ flowchart TD G --> H[/结构化输出/] ``` -## Star History +## 📑 输入和输出 + +- 输入:文档类图像 +- 输出:TXT或Word + +## 💻 安装运行环境 + +```bash +pip install -r requirements.txt +``` + +## 🚀 运行Demo + +```bash +git clone https://github.com/RapidAI/RapidDoc.git +cd RapidDoc +python demo.py +``` + +## 📈 结果示例 + +⚠️注意:之所以提取结果没有分段,是因为版面分析模型没有段落检测功能。现有开源的所有版面分析模型都没有段落检测功能,这个后续会考虑自己训练一个版面分析模型来优化这里。 + +
+ + +
+ +## ⭐ Star History - + - - - Star History Chart + + + Star History Chart diff --git a/demo.py b/demo.py index 715fff9..d9aeebc 100644 --- a/demo.py +++ b/demo.py @@ -1,15 +1,28 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com -from rapid_layout_recover import RapidLayoutRecover +from pathlib import Path -pdf_parser = RapidLayoutRecover() +from rapid_doc import RapidDoc -pdf_path = "tests/test_files/direct_extract/two_column.pdf" +pdf_parser = RapidDoc() + +pdf_path = "tests/test_files/scan_pdf/B0702罗马十二帝王传Page3_5.pdf" +# pdf_path = "tests/test_files/direct_extract/single_column.pdf" result = pdf_parser(pdf_path) +content = [] for v in result: txts = v[2] for vv in txts: print(vv[0] + "\n") + content.append(vv[0]) + +save_dir = Path("outputs") +save_dir.mkdir(parents=True, exist_ok=True) +save_txt_path = save_dir / "1.txt" +with open(save_txt_path, "w", encoding="utf-8") as f: + for v in content: + f.write(f"{v}\n") +print("ok") diff --git a/rapid_doc/__init__.py b/rapid_doc/__init__.py new file mode 100644 index 0000000..f7852e0 --- /dev/null +++ b/rapid_doc/__init__.py @@ -0,0 +1,6 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +from .main import RapidDoc, RapidDocError + +__all__ = ["RapidDoc", "RapidDocError"] diff --git a/rapid_layout_recover/layout_recover/__init__.py b/rapid_doc/layout_recover/__init__.py similarity index 100% rename from rapid_layout_recover/layout_recover/__init__.py rename to rapid_doc/layout_recover/__init__.py diff --git a/rapid_layout_recover/layout_recover/main.py b/rapid_doc/layout_recover/main.py similarity index 99% rename from rapid_layout_recover/layout_recover/main.py rename to rapid_doc/layout_recover/main.py index 3ab9b78..87440bc 100644 --- a/rapid_layout_recover/layout_recover/main.py +++ b/rapid_doc/layout_recover/main.py @@ -3,7 +3,7 @@ # @Contact: liekkaskono@163.com import copy import string -from typing import List +from typing import List, Optional import numpy as np import shapely @@ -22,9 +22,10 @@ def __call__( layout_cls_names: List[str], ocr_boxes: np.ndarray, ocr_rec_res: List[str], - ratio, + ratio: Optional[float] = None, ): - self.ratio = ratio + if ratio is not None: + self.ratio = ratio # 版面分析和段落合并操作 ( diff --git a/rapid_layout_recover/main.py b/rapid_doc/main.py similarity index 74% rename from rapid_layout_recover/main.py rename to rapid_doc/main.py index f50cdda..e24b489 100644 --- a/rapid_layout_recover/main.py +++ b/rapid_doc/main.py @@ -2,7 +2,7 @@ # @Author: SWHL # @Contact: liekkaskono@163.com from pathlib import Path -from typing import List, Union +from typing import List, Tuple, Union import cv2 import fitz @@ -10,16 +10,18 @@ from rapid_layout import RapidLayout from tqdm import tqdm -from .direct_extract import PDFExtract from .layout_recover import LayoutRecover +from .ocr_extract import OCRExtract +from .pdf_extract import PDFExtract from .utils import which_type -class RapidLayoutRecover: +class RapidDoc: def __init__(self, dpi: int = 96): self.dpi = dpi self.layout = RapidLayout() self.pdf_extracter = PDFExtract() + self.ocr_extracter = OCRExtract() self.layout_recover = LayoutRecover() def __call__(self, pdf_path: Union[str, Path]): @@ -29,10 +31,10 @@ def __call__(self, pdf_path: Union[str, Path]): try: file_type = which_type(pdf_path) except (FileExistsError, TypeError) as exc: - raise RapidLayoutRecoverError("The input content is empty.") from exc + raise RapidDocError("The input content is empty.") from exc if file_type != "pdf": - raise RapidLayoutRecoverError("The file type is not PDF format.") + raise RapidDocError("The file type is not PDF format.") self.pdf_extracter.extract_all_pages(pdf_path) @@ -42,10 +44,9 @@ def __call__(self, pdf_path: Union[str, Path]): img = self.convert_img(page) # 版面分析 ([x, 4], ['text', 'text', 'text', 'header']) - layout_bboxes, layout_cls_names, _ = self.layout(img) - layout_bboxes = layout_bboxes.cpu().numpy() + layout_bboxes, _, layout_cls_names, _ = self.layout(img) - # # 可视化当前页 + # 可视化当前页 # import copy # tmp_img = copy.deepcopy(img) @@ -72,12 +73,11 @@ def __call__(self, pdf_path: Union[str, Path]): img_width = img.shape[1] txt_boxes, txts = self.run_direct_extract(i, img_width) else: - # TODO - txt_boxes, txts = self.run_ocr_extract(page) + txt_boxes, txts = self.run_ocr_extract(img) # 逐页合并版面分析和文本结果 img_h, img_w = img.shape[:2] - final_bboxes, final_txts = self.merge_layout_txts( + final_bboxes, final_txts = self.layout_recover( img_h, img_w, layout_bboxes, @@ -99,28 +99,16 @@ def convert_img(self, page): def is_extract(self, page) -> bool: return len(page.get_text()) > 100 - def run_direct_extract(self, page_num: int, img_width: int): + def run_direct_extract( + self, page_num: int, img_width: int + ) -> Tuple[np.ndarray, List[Tuple[str, float]]]: txt_boxes, txts = self.pdf_extracter.extract_page_text(page_num, img_width) return txt_boxes, txts - def run_ocr_extract(self, page): - return None - - def merge_layout_txts( - self, - img_h: int, - img_w: int, - layout_bboxes: np.ndarray, - layout_cls_names: List[str], - txt_boxes: np.ndarray, - txts: List[str], - ratio, - ): - txt_boxes, txts = self.layout_recover( - img_h, img_w, layout_bboxes, layout_cls_names, txt_boxes, txts, ratio - ) + def run_ocr_extract(self, img: np.ndarray): + txt_boxes, txts = self.ocr_extracter(img) return txt_boxes, txts -class RapidLayoutRecoverError(Exception): +class RapidDocError(Exception): pass diff --git a/rapid_layout_recover/ocr_extract/__init__.py b/rapid_doc/ocr_extract/__init__.py similarity index 71% rename from rapid_layout_recover/ocr_extract/__init__.py rename to rapid_doc/ocr_extract/__init__.py index 0ecdd4f..3db482f 100644 --- a/rapid_layout_recover/ocr_extract/__init__.py +++ b/rapid_doc/ocr_extract/__init__.py @@ -1,3 +1,4 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com +from .main import OCRExtract diff --git a/rapid_doc/ocr_extract/main.py b/rapid_doc/ocr_extract/main.py new file mode 100644 index 0000000..d62edb0 --- /dev/null +++ b/rapid_doc/ocr_extract/main.py @@ -0,0 +1,24 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +from typing import List, Optional, Tuple + +import numpy as np +from rapidocr_onnxruntime import RapidOCR + + +class OCRExtract: + def __init__(self): + self.ocr = RapidOCR() + + def __call__( + self, img: np.ndarray + ) -> Optional[Tuple[np.ndarray, List[Tuple[str, float]]]]: + result, _ = self.ocr(img) + if not result: + return None + + boxes, txts, scores = list(zip(*result)) + boxes = np.array(boxes) + txts = list(zip(txts, scores)) + return boxes, txts diff --git a/rapid_layout_recover/direct_extract/__init__.py b/rapid_doc/pdf_extract/__init__.py similarity index 67% rename from rapid_layout_recover/direct_extract/__init__.py rename to rapid_doc/pdf_extract/__init__.py index 78158f7..c7c7836 100644 --- a/rapid_layout_recover/direct_extract/__init__.py +++ b/rapid_doc/pdf_extract/__init__.py @@ -1,4 +1,4 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com -from .pdf_extract import PDFExtract +from .main import PDFExtract diff --git a/rapid_layout_recover/direct_extract/pdf_extract.py b/rapid_doc/pdf_extract/main.py similarity index 88% rename from rapid_layout_recover/direct_extract/pdf_extract.py rename to rapid_doc/pdf_extract/main.py index df7bcf4..b56818a 100644 --- a/rapid_layout_recover/direct_extract/pdf_extract.py +++ b/rapid_doc/pdf_extract/main.py @@ -1,10 +1,12 @@ # -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com import copy -import re import string from collections import Counter from typing import List, Optional +import camelot import cv2 import fitz import numpy as np @@ -13,19 +15,19 @@ from pdfminer.layout import LTPage, LTTextBoxHorizontal, LTTextLineHorizontal from shapely.geometry import MultiPoint, Polygon +from ..utils import is_contain_continous_str, only_contain_str + class PDFExtract: def __init__(self): - self.ratio = None - - self.texts = [] - self.table_content = [] + self.pdf_path = None self.pages = None + self.ratio = None - def extract_all_pages(self, pdf_path): + def extract_all_pages(self, pdf_path: str): self.pages = list(extract_pages(pdf_path)) - def read_pdf(self) -> List: + def read_pdf(self, pdf_path) -> List: def convert_img(page): pix = page.get_pixmap(dpi=200) img = np.frombuffer(pix.samples, dtype=np.uint8) @@ -33,12 +35,12 @@ def convert_img(page): img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) return img - with fitz.open(stream=self.pdf_path) as pdfer: + with fitz.open(pdf_path) as pdfer: pdf_img_list = list(map(convert_img, pdfer)) return pdf_img_list - def get_page_count(self): - with fitz.open(stream=self.pdf_path) as pdfer: + def get_page_count(self, pdf_path): + with fitz.open(pdf_path) as pdfer: return pdfer.page_count def merge_ocr_direct(self, img, page_num, dt_boxes, rec_res): @@ -87,8 +89,7 @@ def extract_page_text(self, page_num, ori_img_width): if not isinstance(page, LTPage): return np.array([]) - page_height = page.height - texts, boxes = [], [] + boxes, self.texts = [], [] for text_box_h in page: if not isinstance(text_box_h, LTTextBoxHorizontal): continue @@ -97,24 +98,16 @@ def extract_page_text(self, page_num, ori_img_width): if not isinstance(text_box_h_l, LTTextLineHorizontal): continue - # 注意这里bbox的返回值是left,bottom,right,top - left, bottom, right, top = text_box_h_l.bbox - - # 注意 bottom和top是距离页面底部的坐标值, - # 需要用当前页面高度减当前坐标值,才是以左上角为原点的坐标 - bottom = page_height - bottom - top = page_height - top - text = text_box_h_l.get_text() - - x0, y0 = left, top - x1, y1 = right, bottom + x0, y0, x1, y1 = text_box_h_l.bbox + y0 = page.height - y0 + y1 = page.height - y1 text = text_box_h_l.get_text() boxes.append([[x0, y0], [x1, y0], [x1, y1], [x0, y1]]) - texts.append((text.strip(), 1.0)) + self.texts.append(text) self.ratio = ori_img_width / page.width - return np.array(boxes), texts + return np.array(boxes) def get_matched_boxes_rec(self, dt_boxes, direct_boxes, rec_res): invalid_symbol_pattern = r'[$#&‘’”“(){}\[\]>?%,-./*!="+:&@]{3,}' @@ -160,8 +153,7 @@ def process_en_text(text): ): # SatELLItE break - - if ( + elif ( first_ele.islower() and last_ele.islower() and not only_contain_str(middle_eles, string.ascii_lowercase) @@ -257,6 +249,7 @@ def extract_tables(self, page_num): line_scale=40, ) table_bbox = [] + self.table_content = [] for one_table in tables: pdf_height = one_table._image[0].shape[0] / (300 / 72) x0, y0, x1, y1 = one_table._bbox @@ -332,24 +325,3 @@ def _compute_poly_iou(poly1, poly2): except shapely.geos.TopologicalError: print("shapely.geos.TopologicalError occured, iou set to 0") return iou - - -def is_contain_continous_str(content: str, pattern: str) -> bool: - """是否存在匹配满足pattern的连续字符""" - match_result = re.findall(pattern, content) - if match_result: - return True - return False - - -def only_contain_str(src_text, given_str_list=None): - """是否只包含given_str_list中字符 - - :param src_text (str): 给定文本 - :param given_str_list (list): , defaults to None - :return: bool - """ - for value in src_text: - if value not in given_str_list: - return False - return True diff --git a/rapid_doc/utils.py b/rapid_doc/utils.py new file mode 100644 index 0000000..50cf138 --- /dev/null +++ b/rapid_doc/utils.py @@ -0,0 +1,134 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +import copy +import importlib +import re +import time +from datetime import datetime, timedelta +from pathlib import Path +from typing import List, Union + +import cv2 +import filetype +import numpy as np + + +def which_type(content: Union[bytes, str, Path]) -> str: + if isinstance(content, (str, Path)) and not Path(content).exists(): + raise FileExistsError(f"{content} does not exist.") + + kind = filetype.guess(content) + if kind is None: + raise TypeError(f"The type of {content} does not support.") + + return kind.extension + + +def write_txt(save_path: str, content: list, mode="w"): + """ + 将list内容写入txt中 + @param + content: list格式内容 + save_path: 绝对路径str + @return:None + """ + with open(save_path, mode, encoding="utf-8") as f: + for value in content: + if isinstance(value, str): + f.write(value + "\n") + elif isinstance(value, list): + for one_v in value: + f.write(f"{one_v[0]}\n") + else: + continue + + +def remove_invalid(content_list, invalid_list): + return [v for i, v in enumerate(content_list) if i not in invalid_list] + + +def is_contain_continous_str(content: str, pattern: str) -> bool: + """是否存在匹配满足pattern的连续字符""" + match_result = re.findall(pattern, content) + if match_result: + return True + return False + + +def draw_text_det_res(dt_boxes, raw_im): + src_im = copy.deepcopy(raw_im) + for i, box in enumerate(dt_boxes): + box = np.array(box).astype(np.int32).reshape(-1, 2) + cv2.polylines(src_im, [box], True, color=(0, 0, 255), thickness=1) + cv2.putText( + src_im, + str(i), + (int(box[0][0]), int(box[0][1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 0, 0), + 2, + ) + return src_im + + +def mkdir(dir_path): + Path(dir_path).mkdir(parents=True, exist_ok=True) + + +def get_between_day(begin_date, end_date): + date_list = [] + begin_date = datetime.strptime(begin_date, "%Y-%m-%d") + end_date = datetime.strptime(end_date, "%Y-%m-%d") + while begin_date <= end_date: + date_str = begin_date.strftime("%Y-%m-%d") + date_list.append(date_str) + begin_date += timedelta(days=1) + return date_list + + +def get_seconds(str_date): + date_time = datetime.strptime(str_date, "%Y-%m-%d") + timedelta_between = date_time - datetime(1900, 1, 1) + return timedelta_between.total_seconds() + + +def import_module(module_dict): + imported_module = importlib.import_module(module_dict["module_dir"]) + module_class = getattr(imported_module, module_dict["module_name"]) + return module_class + + +def get_cur_day(): + cur_day = time.strftime("%Y-%m-%d", time.localtime(time.time())) + return cur_day + + +def only_contain_str(src_text, given_str_list=None): + """是否只包含given_str_list中字符 + + :param src_text (str): 给定文本 + :param given_str_list (list): , defaults to None + :return: bool + """ + for value in src_text: + if value not in given_str_list: + return False + return True + + +def is_contain_str( + src_text: Union[str, List], + given_str_list: Union[str, List], +) -> bool: + """src_text中是否包含given_str_list中任意一个字符 + + Args: + src_text (str or list): + given_str_list (str or list): + + Returns: + bool: + """ + return any(i in src_text for i in given_str_list) diff --git a/rapid_layout_recover/__init__.py b/rapid_layout_recover/__init__.py deleted file mode 100644 index 0aad210..0000000 --- a/rapid_layout_recover/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- encoding: utf-8 -*- -# @Author: SWHL -# @Contact: liekkaskono@163.com -from .main import RapidLayoutRecover, RapidLayoutRecoverError - -__all__ = ["RapidLayoutRecover", "RapidLayoutRecoverError"] diff --git a/rapid_layout_recover/utils.py b/rapid_layout_recover/utils.py deleted file mode 100644 index aec642d..0000000 --- a/rapid_layout_recover/utils.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- encoding: utf-8 -*- -# @Author: SWHL -# @Contact: liekkaskono@163.com -from pathlib import Path -from typing import Union - -import filetype - - -def which_type(content: Union[bytes, str, Path]) -> str: - if isinstance(content, (str, Path)) and not Path(content).exists(): - raise FileExistsError(f"{content} does not exist.") - - kind = filetype.guess(content) - if kind is None: - raise TypeError(f"The type of {content} does not support.") - - return kind.extension diff --git a/test_pdf_extract.py b/test_pdf_extract.py new file mode 100644 index 0000000..549455c --- /dev/null +++ b/test_pdf_extract.py @@ -0,0 +1,13 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +from pathlib import Path + +from rapid_doc.pdf_extract.main import PDFExtract + +pdf_path = Path("tests/test_files/direct_extract/single_column.pdf") +extract = PDFExtract(pdf_path) + +pdf_img_list = extract.read_pdf() +pdf_nums = extract.get_page_count() +print("ok") diff --git a/tests/test_main.py b/tests/test_main.py index 49600c8..c344414 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -9,9 +9,9 @@ sys.path.append(str(root_dir)) -from rapid_layout_recover import RapidLayoutRecover +from rapid_doc import RapidDoc -layout_recover = RapidLayoutRecover() +layout_recover = RapidDoc() test_file_dir = cur_dir / "test_files" @@ -20,7 +20,5 @@ def test_direct_single_column(): pdf_path = test_file_dir / "direct_extract" / "single_column.pdf" result = layout_recover(pdf_path) - assert len(result) == 1 - assert len(result[0][2]) == 9 assert result[0][2][0][0][:5] == "星期天早晨"