From 0c9b5b6a3c1b45d2efb236c436ce9bad54b9dd2e Mon Sep 17 00:00:00 2001 From: SWHL Date: Tue, 10 Sep 2024 10:14:10 +0800 Subject: [PATCH 1/9] docs: Update README --- README.md | 47 ++++++++++++++++++++++++++++++------ demo.py | 14 ++++++++++- rapid_layout_recover/main.py | 3 +-- 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5c01438..e672c32 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,21 @@ -# Work In Progress +# 🚀 Work In Progress + 整体功能还没开发完哈!欢迎加入一起搞 -## Rapid Layout Recover +## 🔍 Rapid Layout Recover 该项目主要针对文档类图像做版面还原,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。 -## 输入和输出 +## 🛠️ 整体框架 -- 输入:文档类图像 -- 输出:TXT或Word +以下为整个项目依赖包,均为RapidAI出品。 -## 整体框架 +- [rapid_orientation](https://github.com/RapidAI/RapidStructure/blob/main/docs/README_Orientation.md) +- [rapid_layout](https://github.com/RapidAI/RapidLayout) +- [rapid_table](https://github.com/RapidAI/RapidTable) +- [rapid_latex_ocr](https://github.com/RapidAI/RapidLatexOCR) +- [rapidocr_onnxruntime](https://github.com/RapidAI/RapidOCR) +- [rapidocr_layout_recover](https://github.com/RapidAI/RapidLayoutRecover) ```mermaid flowchart TD @@ -19,7 +24,35 @@ flowchart TD G --> H[/结构化输出/] ``` -## Star History +## 📑 输入和输出 + +- 输入:文档类图像 +- 输出:TXT或Word + +## 💻 安装运行环境 + +```bash +pip install -r requirements.txt +``` + +## 🚀 运行Demo + +```bash +git clone https://github.com/RapidAI/RapidLayoutRecover.git +cd RapidLayoutRecover +python demo.py +``` + +## 📈 结果示例 + +⚠️注意:之所以提取结果没有分段,是因为版面分析模型没有段落检测功能。现有开源的所有版面分析模型都没有段落检测功能,这个后续会考虑自己训练一个版面分析模型来优化这里。 + +
+ + +
+ +## ⭐ Star History diff --git a/demo.py b/demo.py index 715fff9..0f2abb0 100644 --- a/demo.py +++ b/demo.py @@ -1,15 +1,27 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com +from pathlib import Path + from rapid_layout_recover import RapidLayoutRecover pdf_parser = RapidLayoutRecover() -pdf_path = "tests/test_files/direct_extract/two_column.pdf" +pdf_path = "tests/test_files/direct_extract/single_column.pdf" result = pdf_parser(pdf_path) +content = [] for v in result: txts = v[2] for vv in txts: print(vv[0] + "\n") + content.append(vv[0]) + +save_dir = Path("outputs") +save_dir.mkdir(parents=True, exist_ok=True) +save_txt_path = save_dir / "1.txt" +with open(save_txt_path, "w", encoding="utf-8") as f: + for v in content: + f.write(f"{v}\n") +print("ok") diff --git a/rapid_layout_recover/main.py b/rapid_layout_recover/main.py index f50cdda..3940242 100644 --- a/rapid_layout_recover/main.py +++ b/rapid_layout_recover/main.py @@ -42,8 +42,7 @@ def __call__(self, pdf_path: Union[str, Path]): img = self.convert_img(page) # 版面分析 ([x, 4], ['text', 'text', 'text', 'header']) - layout_bboxes, layout_cls_names, _ = self.layout(img) - layout_bboxes = layout_bboxes.cpu().numpy() + layout_bboxes, _, layout_cls_names, _ = self.layout(img) # # 可视化当前页 # import copy From dba1ffd04ecb7263f0714508f60cd8650bf54828 Mon Sep 17 00:00:00 2001 From: SWHL Date: Tue, 10 Sep 2024 12:54:21 +0800 Subject: [PATCH 2/9] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e672c32..aaefc0c 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ## 🛠️ 整体框架 -以下为整个项目依赖包,均为RapidAI出品。 +以下为整体框架依赖包,均为RapidAI出品。 - [rapid_orientation](https://github.com/RapidAI/RapidStructure/blob/main/docs/README_Orientation.md) - [rapid_layout](https://github.com/RapidAI/RapidLayout) From 318aea5280f9a0559c169806eef09334597685f9 Mon Sep 17 00:00:00 2001 From: SWHL Date: Wed, 9 Oct 2024 10:19:35 +0800 Subject: [PATCH 3/9] chore: Update files --- .gitignore | 3 +- demo.py | 2 +- .../direct_extract/__init__.py | 2 +- .../{pdf_extract.py => main.py} | 4 +- rapid_layout_recover/main.py | 37 +++++++------------ rapid_layout_recover/ocr_extract/__init__.py | 1 + rapid_layout_recover/ocr_extract/main.py | 24 ++++++++++++ tests/test_main.py | 2 - 8 files changed, 45 insertions(+), 30 deletions(-) rename rapid_layout_recover/direct_extract/{pdf_extract.py => main.py} (99%) create mode 100644 rapid_layout_recover/ocr_extract/main.py diff --git a/.gitignore b/.gitignore index 6d3932f..69525af 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.vscode +outputs/ *.pyc @@ -168,4 +169,4 @@ long1.jpg *.pdiparams.info *.pdmodel -.DS_Store \ No newline at end of file +.DS_Store diff --git a/demo.py b/demo.py index 0f2abb0..fe5d579 100644 --- a/demo.py +++ b/demo.py @@ -7,7 +7,7 @@ pdf_parser = RapidLayoutRecover() -pdf_path = "tests/test_files/direct_extract/single_column.pdf" +pdf_path = "tests/test_files/scan_pdf/B0702罗马十二帝王传Page3_5.pdf" result = pdf_parser(pdf_path) diff --git a/rapid_layout_recover/direct_extract/__init__.py b/rapid_layout_recover/direct_extract/__init__.py index 78158f7..82ac5ca 100644 --- a/rapid_layout_recover/direct_extract/__init__.py +++ b/rapid_layout_recover/direct_extract/__init__.py @@ -1,4 +1,4 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com -from .pdf_extract import PDFExtract +from .main import DirectExtract diff --git a/rapid_layout_recover/direct_extract/pdf_extract.py b/rapid_layout_recover/direct_extract/main.py similarity index 99% rename from rapid_layout_recover/direct_extract/pdf_extract.py rename to rapid_layout_recover/direct_extract/main.py index df7bcf4..439dced 100644 --- a/rapid_layout_recover/direct_extract/pdf_extract.py +++ b/rapid_layout_recover/direct_extract/main.py @@ -1,4 +1,6 @@ # -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com import copy import re import string @@ -14,7 +16,7 @@ from shapely.geometry import MultiPoint, Polygon -class PDFExtract: +class DirectExtract: def __init__(self): self.ratio = None diff --git a/rapid_layout_recover/main.py b/rapid_layout_recover/main.py index 3940242..749977a 100644 --- a/rapid_layout_recover/main.py +++ b/rapid_layout_recover/main.py @@ -2,7 +2,7 @@ # @Author: SWHL # @Contact: liekkaskono@163.com from pathlib import Path -from typing import List, Union +from typing import List, Tuple, Union import cv2 import fitz @@ -10,8 +10,9 @@ from rapid_layout import RapidLayout from tqdm import tqdm -from .direct_extract import PDFExtract +from .direct_extract import DirectExtract from .layout_recover import LayoutRecover +from .ocr_extract import OCRExtract from .utils import which_type @@ -19,7 +20,8 @@ class RapidLayoutRecover: def __init__(self, dpi: int = 96): self.dpi = dpi self.layout = RapidLayout() - self.pdf_extracter = PDFExtract() + self.pdf_extracter = DirectExtract() + self.ocr_extracter = OCRExtract() self.layout_recover = LayoutRecover() def __call__(self, pdf_path: Union[str, Path]): @@ -44,7 +46,7 @@ def __call__(self, pdf_path: Union[str, Path]): # 版面分析 ([x, 4], ['text', 'text', 'text', 'header']) layout_bboxes, _, layout_cls_names, _ = self.layout(img) - # # 可视化当前页 + # 可视化当前页 # import copy # tmp_img = copy.deepcopy(img) @@ -71,12 +73,11 @@ def __call__(self, pdf_path: Union[str, Path]): img_width = img.shape[1] txt_boxes, txts = self.run_direct_extract(i, img_width) else: - # TODO - txt_boxes, txts = self.run_ocr_extract(page) + txt_boxes, txts = self.run_ocr_extract(img) # 逐页合并版面分析和文本结果 img_h, img_w = img.shape[:2] - final_bboxes, final_txts = self.merge_layout_txts( + final_bboxes, final_txts = self.layout_recover( img_h, img_w, layout_bboxes, @@ -98,26 +99,14 @@ def convert_img(self, page): def is_extract(self, page) -> bool: return len(page.get_text()) > 100 - def run_direct_extract(self, page_num: int, img_width: int): + def run_direct_extract( + self, page_num: int, img_width: int + ) -> Tuple[np.ndarray, List[Tuple[str, float]]]: txt_boxes, txts = self.pdf_extracter.extract_page_text(page_num, img_width) return txt_boxes, txts - def run_ocr_extract(self, page): - return None - - def merge_layout_txts( - self, - img_h: int, - img_w: int, - layout_bboxes: np.ndarray, - layout_cls_names: List[str], - txt_boxes: np.ndarray, - txts: List[str], - ratio, - ): - txt_boxes, txts = self.layout_recover( - img_h, img_w, layout_bboxes, layout_cls_names, txt_boxes, txts, ratio - ) + def run_ocr_extract(self, img: np.ndarray): + txt_boxes, txts = self.ocr_extracter(img) return txt_boxes, txts diff --git a/rapid_layout_recover/ocr_extract/__init__.py b/rapid_layout_recover/ocr_extract/__init__.py index 0ecdd4f..3db482f 100644 --- a/rapid_layout_recover/ocr_extract/__init__.py +++ b/rapid_layout_recover/ocr_extract/__init__.py @@ -1,3 +1,4 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com +from .main import OCRExtract diff --git a/rapid_layout_recover/ocr_extract/main.py b/rapid_layout_recover/ocr_extract/main.py new file mode 100644 index 0000000..d62edb0 --- /dev/null +++ b/rapid_layout_recover/ocr_extract/main.py @@ -0,0 +1,24 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +from typing import List, Optional, Tuple + +import numpy as np +from rapidocr_onnxruntime import RapidOCR + + +class OCRExtract: + def __init__(self): + self.ocr = RapidOCR() + + def __call__( + self, img: np.ndarray + ) -> Optional[Tuple[np.ndarray, List[Tuple[str, float]]]]: + result, _ = self.ocr(img) + if not result: + return None + + boxes, txts, scores = list(zip(*result)) + boxes = np.array(boxes) + txts = list(zip(txts, scores)) + return boxes, txts diff --git a/tests/test_main.py b/tests/test_main.py index 49600c8..4fdcf9d 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -20,7 +20,5 @@ def test_direct_single_column(): pdf_path = test_file_dir / "direct_extract" / "single_column.pdf" result = layout_recover(pdf_path) - assert len(result) == 1 - assert len(result[0][2]) == 9 assert result[0][2][0][0][:5] == "星期天早晨" From eee1e37a4674482d9ab4927c98e8fef4b3e9c995 Mon Sep 17 00:00:00 2001 From: SWHL Date: Thu, 10 Oct 2024 22:27:55 +0800 Subject: [PATCH 4/9] chore: Update files --- demo.py | 1 + rapid_layout_recover/layout_recover/main.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/demo.py b/demo.py index fe5d579..7ca4765 100644 --- a/demo.py +++ b/demo.py @@ -8,6 +8,7 @@ pdf_parser = RapidLayoutRecover() pdf_path = "tests/test_files/scan_pdf/B0702罗马十二帝王传Page3_5.pdf" +# pdf_path = "tests/test_files/direct_extract/single_column.pdf" result = pdf_parser(pdf_path) diff --git a/rapid_layout_recover/layout_recover/main.py b/rapid_layout_recover/layout_recover/main.py index 3ab9b78..87440bc 100644 --- a/rapid_layout_recover/layout_recover/main.py +++ b/rapid_layout_recover/layout_recover/main.py @@ -3,7 +3,7 @@ # @Contact: liekkaskono@163.com import copy import string -from typing import List +from typing import List, Optional import numpy as np import shapely @@ -22,9 +22,10 @@ def __call__( layout_cls_names: List[str], ocr_boxes: np.ndarray, ocr_rec_res: List[str], - ratio, + ratio: Optional[float] = None, ): - self.ratio = ratio + if ratio is not None: + self.ratio = ratio # 版面分析和段落合并操作 ( From 6da47d7474a9b098799c74eeae325c2197c76af2 Mon Sep 17 00:00:00 2001 From: SWHL Date: Fri, 11 Oct 2024 09:42:11 +0800 Subject: [PATCH 5/9] chore: Change repo name --- README.md | 4 ++-- demo.py | 2 +- {rapid_layout_recover => rapid_doc}/__init__.py | 0 .../direct_extract/__init__.py | 0 {rapid_layout_recover => rapid_doc}/direct_extract/main.py | 0 .../layout_recover/__init__.py | 0 {rapid_layout_recover => rapid_doc}/layout_recover/main.py | 0 {rapid_layout_recover => rapid_doc}/main.py | 0 {rapid_layout_recover => rapid_doc}/ocr_extract/__init__.py | 0 {rapid_layout_recover => rapid_doc}/ocr_extract/main.py | 0 {rapid_layout_recover => rapid_doc}/utils.py | 0 tests/test_main.py | 2 +- 12 files changed, 4 insertions(+), 4 deletions(-) rename {rapid_layout_recover => rapid_doc}/__init__.py (100%) rename {rapid_layout_recover => rapid_doc}/direct_extract/__init__.py (100%) rename {rapid_layout_recover => rapid_doc}/direct_extract/main.py (100%) rename {rapid_layout_recover => rapid_doc}/layout_recover/__init__.py (100%) rename {rapid_layout_recover => rapid_doc}/layout_recover/main.py (100%) rename {rapid_layout_recover => rapid_doc}/main.py (100%) rename {rapid_layout_recover => rapid_doc}/ocr_extract/__init__.py (100%) rename {rapid_layout_recover => rapid_doc}/ocr_extract/main.py (100%) rename {rapid_layout_recover => rapid_doc}/utils.py (100%) diff --git a/README.md b/README.md index aaefc0c..a0ed8be 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ 整体功能还没开发完哈!欢迎加入一起搞 -## 🔍 Rapid Layout Recover +## 🔍 Rapid Doc -该项目主要针对文档类图像做版面还原,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。 +该项目主要针对文档类图像做内容提取,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。后续计划支持输入PDF/图像,输出对应json格式、Txt格式、Word格式和Markdown格式。 ## 🛠️ 整体框架 diff --git a/demo.py b/demo.py index 7ca4765..447faea 100644 --- a/demo.py +++ b/demo.py @@ -3,7 +3,7 @@ # @Contact: liekkaskono@163.com from pathlib import Path -from rapid_layout_recover import RapidLayoutRecover +from rapid_doc import RapidLayoutRecover pdf_parser = RapidLayoutRecover() diff --git a/rapid_layout_recover/__init__.py b/rapid_doc/__init__.py similarity index 100% rename from rapid_layout_recover/__init__.py rename to rapid_doc/__init__.py diff --git a/rapid_layout_recover/direct_extract/__init__.py b/rapid_doc/direct_extract/__init__.py similarity index 100% rename from rapid_layout_recover/direct_extract/__init__.py rename to rapid_doc/direct_extract/__init__.py diff --git a/rapid_layout_recover/direct_extract/main.py b/rapid_doc/direct_extract/main.py similarity index 100% rename from rapid_layout_recover/direct_extract/main.py rename to rapid_doc/direct_extract/main.py diff --git a/rapid_layout_recover/layout_recover/__init__.py b/rapid_doc/layout_recover/__init__.py similarity index 100% rename from rapid_layout_recover/layout_recover/__init__.py rename to rapid_doc/layout_recover/__init__.py diff --git a/rapid_layout_recover/layout_recover/main.py b/rapid_doc/layout_recover/main.py similarity index 100% rename from rapid_layout_recover/layout_recover/main.py rename to rapid_doc/layout_recover/main.py diff --git a/rapid_layout_recover/main.py b/rapid_doc/main.py similarity index 100% rename from rapid_layout_recover/main.py rename to rapid_doc/main.py diff --git a/rapid_layout_recover/ocr_extract/__init__.py b/rapid_doc/ocr_extract/__init__.py similarity index 100% rename from rapid_layout_recover/ocr_extract/__init__.py rename to rapid_doc/ocr_extract/__init__.py diff --git a/rapid_layout_recover/ocr_extract/main.py b/rapid_doc/ocr_extract/main.py similarity index 100% rename from rapid_layout_recover/ocr_extract/main.py rename to rapid_doc/ocr_extract/main.py diff --git a/rapid_layout_recover/utils.py b/rapid_doc/utils.py similarity index 100% rename from rapid_layout_recover/utils.py rename to rapid_doc/utils.py diff --git a/tests/test_main.py b/tests/test_main.py index 4fdcf9d..b26e3dd 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -9,7 +9,7 @@ sys.path.append(str(root_dir)) -from rapid_layout_recover import RapidLayoutRecover +from rapid_doc import RapidLayoutRecover layout_recover = RapidLayoutRecover() From d7da8b703622fbda4b20a8462c884fb7b3be79de Mon Sep 17 00:00:00 2001 From: SWHL Date: Fri, 11 Oct 2024 09:45:35 +0800 Subject: [PATCH 6/9] docs: Update README --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a0ed8be..bb8c769 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ python demo.py ⚠️注意:之所以提取结果没有分段,是因为版面分析模型没有段落检测功能。现有开源的所有版面分析模型都没有段落检测功能,这个后续会考虑自己训练一个版面分析模型来优化这里。
- +
@@ -56,8 +56,8 @@ python demo.py
- - - Star History Chart + + + Star History Chart From 59dd3fc4e7e6520570395b030fdb0af999e41432 Mon Sep 17 00:00:00 2001 From: SWHL Date: Fri, 11 Oct 2024 22:22:16 +0800 Subject: [PATCH 7/9] chore: Update files --- README.md | 8 +- demo.py | 4 +- rapid_doc/__init__.py | 4 +- rapid_doc/main.py | 12 +- .../__init__.py | 2 +- .../{direct_extract => pdf_extract}/main.py | 68 +++------- rapid_doc/utils.py | 118 +++++++++++++++++- test_pdf_extract.py | 13 ++ tests/test_main.py | 4 +- 9 files changed, 166 insertions(+), 67 deletions(-) rename rapid_doc/{direct_extract => pdf_extract}/__init__.py (69%) rename rapid_doc/{direct_extract => pdf_extract}/main.py (87%) create mode 100644 test_pdf_extract.py diff --git a/README.md b/README.md index bb8c769..9f2df8c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ - [rapid_table](https://github.com/RapidAI/RapidTable) - [rapid_latex_ocr](https://github.com/RapidAI/RapidLatexOCR) - [rapidocr_onnxruntime](https://github.com/RapidAI/RapidOCR) -- [rapidocr_layout_recover](https://github.com/RapidAI/RapidLayoutRecover) +- [rapidocr_layout_recover](https://github.com/RapidAI/RapidDoc) ```mermaid flowchart TD @@ -38,8 +38,8 @@ pip install -r requirements.txt ## 🚀 运行Demo ```bash -git clone https://github.com/RapidAI/RapidLayoutRecover.git -cd RapidLayoutRecover +git clone https://github.com/RapidAI/RapidDoc.git +cd RapidDoc python demo.py ``` @@ -54,7 +54,7 @@ python demo.py ## ⭐ Star History - + diff --git a/demo.py b/demo.py index 447faea..d9aeebc 100644 --- a/demo.py +++ b/demo.py @@ -3,9 +3,9 @@ # @Contact: liekkaskono@163.com from pathlib import Path -from rapid_doc import RapidLayoutRecover +from rapid_doc import RapidDoc -pdf_parser = RapidLayoutRecover() +pdf_parser = RapidDoc() pdf_path = "tests/test_files/scan_pdf/B0702罗马十二帝王传Page3_5.pdf" # pdf_path = "tests/test_files/direct_extract/single_column.pdf" diff --git a/rapid_doc/__init__.py b/rapid_doc/__init__.py index 0aad210..f7852e0 100644 --- a/rapid_doc/__init__.py +++ b/rapid_doc/__init__.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com -from .main import RapidLayoutRecover, RapidLayoutRecoverError +from .main import RapidDoc, RapidDocError -__all__ = ["RapidLayoutRecover", "RapidLayoutRecoverError"] +__all__ = ["RapidDoc", "RapidDocError"] diff --git a/rapid_doc/main.py b/rapid_doc/main.py index 749977a..e24b489 100644 --- a/rapid_doc/main.py +++ b/rapid_doc/main.py @@ -10,17 +10,17 @@ from rapid_layout import RapidLayout from tqdm import tqdm -from .direct_extract import DirectExtract from .layout_recover import LayoutRecover from .ocr_extract import OCRExtract +from .pdf_extract import PDFExtract from .utils import which_type -class RapidLayoutRecover: +class RapidDoc: def __init__(self, dpi: int = 96): self.dpi = dpi self.layout = RapidLayout() - self.pdf_extracter = DirectExtract() + self.pdf_extracter = PDFExtract() self.ocr_extracter = OCRExtract() self.layout_recover = LayoutRecover() @@ -31,10 +31,10 @@ def __call__(self, pdf_path: Union[str, Path]): try: file_type = which_type(pdf_path) except (FileExistsError, TypeError) as exc: - raise RapidLayoutRecoverError("The input content is empty.") from exc + raise RapidDocError("The input content is empty.") from exc if file_type != "pdf": - raise RapidLayoutRecoverError("The file type is not PDF format.") + raise RapidDocError("The file type is not PDF format.") self.pdf_extracter.extract_all_pages(pdf_path) @@ -110,5 +110,5 @@ def run_ocr_extract(self, img: np.ndarray): return txt_boxes, txts -class RapidLayoutRecoverError(Exception): +class RapidDocError(Exception): pass diff --git a/rapid_doc/direct_extract/__init__.py b/rapid_doc/pdf_extract/__init__.py similarity index 69% rename from rapid_doc/direct_extract/__init__.py rename to rapid_doc/pdf_extract/__init__.py index 82ac5ca..c7c7836 100644 --- a/rapid_doc/direct_extract/__init__.py +++ b/rapid_doc/pdf_extract/__init__.py @@ -1,4 +1,4 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com -from .main import DirectExtract +from .main import PDFExtract diff --git a/rapid_doc/direct_extract/main.py b/rapid_doc/pdf_extract/main.py similarity index 87% rename from rapid_doc/direct_extract/main.py rename to rapid_doc/pdf_extract/main.py index 439dced..b56818a 100644 --- a/rapid_doc/direct_extract/main.py +++ b/rapid_doc/pdf_extract/main.py @@ -2,11 +2,11 @@ # @Author: SWHL # @Contact: liekkaskono@163.com import copy -import re import string from collections import Counter from typing import List, Optional +import camelot import cv2 import fitz import numpy as np @@ -15,19 +15,19 @@ from pdfminer.layout import LTPage, LTTextBoxHorizontal, LTTextLineHorizontal from shapely.geometry import MultiPoint, Polygon +from ..utils import is_contain_continous_str, only_contain_str -class DirectExtract: - def __init__(self): - self.ratio = None - self.texts = [] - self.table_content = [] +class PDFExtract: + def __init__(self): + self.pdf_path = None self.pages = None + self.ratio = None - def extract_all_pages(self, pdf_path): + def extract_all_pages(self, pdf_path: str): self.pages = list(extract_pages(pdf_path)) - def read_pdf(self) -> List: + def read_pdf(self, pdf_path) -> List: def convert_img(page): pix = page.get_pixmap(dpi=200) img = np.frombuffer(pix.samples, dtype=np.uint8) @@ -35,12 +35,12 @@ def convert_img(page): img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) return img - with fitz.open(stream=self.pdf_path) as pdfer: + with fitz.open(pdf_path) as pdfer: pdf_img_list = list(map(convert_img, pdfer)) return pdf_img_list - def get_page_count(self): - with fitz.open(stream=self.pdf_path) as pdfer: + def get_page_count(self, pdf_path): + with fitz.open(pdf_path) as pdfer: return pdfer.page_count def merge_ocr_direct(self, img, page_num, dt_boxes, rec_res): @@ -89,8 +89,7 @@ def extract_page_text(self, page_num, ori_img_width): if not isinstance(page, LTPage): return np.array([]) - page_height = page.height - texts, boxes = [], [] + boxes, self.texts = [], [] for text_box_h in page: if not isinstance(text_box_h, LTTextBoxHorizontal): continue @@ -99,24 +98,16 @@ def extract_page_text(self, page_num, ori_img_width): if not isinstance(text_box_h_l, LTTextLineHorizontal): continue - # 注意这里bbox的返回值是left,bottom,right,top - left, bottom, right, top = text_box_h_l.bbox - - # 注意 bottom和top是距离页面底部的坐标值, - # 需要用当前页面高度减当前坐标值,才是以左上角为原点的坐标 - bottom = page_height - bottom - top = page_height - top - text = text_box_h_l.get_text() - - x0, y0 = left, top - x1, y1 = right, bottom + x0, y0, x1, y1 = text_box_h_l.bbox + y0 = page.height - y0 + y1 = page.height - y1 text = text_box_h_l.get_text() boxes.append([[x0, y0], [x1, y0], [x1, y1], [x0, y1]]) - texts.append((text.strip(), 1.0)) + self.texts.append(text) self.ratio = ori_img_width / page.width - return np.array(boxes), texts + return np.array(boxes) def get_matched_boxes_rec(self, dt_boxes, direct_boxes, rec_res): invalid_symbol_pattern = r'[$#&‘’”“(){}\[\]>?%,-./*!="+:&@]{3,}' @@ -162,8 +153,7 @@ def process_en_text(text): ): # SatELLItE break - - if ( + elif ( first_ele.islower() and last_ele.islower() and not only_contain_str(middle_eles, string.ascii_lowercase) @@ -259,6 +249,7 @@ def extract_tables(self, page_num): line_scale=40, ) table_bbox = [] + self.table_content = [] for one_table in tables: pdf_height = one_table._image[0].shape[0] / (300 / 72) x0, y0, x1, y1 = one_table._bbox @@ -334,24 +325,3 @@ def _compute_poly_iou(poly1, poly2): except shapely.geos.TopologicalError: print("shapely.geos.TopologicalError occured, iou set to 0") return iou - - -def is_contain_continous_str(content: str, pattern: str) -> bool: - """是否存在匹配满足pattern的连续字符""" - match_result = re.findall(pattern, content) - if match_result: - return True - return False - - -def only_contain_str(src_text, given_str_list=None): - """是否只包含given_str_list中字符 - - :param src_text (str): 给定文本 - :param given_str_list (list): , defaults to None - :return: bool - """ - for value in src_text: - if value not in given_str_list: - return False - return True diff --git a/rapid_doc/utils.py b/rapid_doc/utils.py index aec642d..50cf138 100644 --- a/rapid_doc/utils.py +++ b/rapid_doc/utils.py @@ -1,10 +1,17 @@ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com +import copy +import importlib +import re +import time +from datetime import datetime, timedelta from pathlib import Path -from typing import Union +from typing import List, Union +import cv2 import filetype +import numpy as np def which_type(content: Union[bytes, str, Path]) -> str: @@ -16,3 +23,112 @@ def which_type(content: Union[bytes, str, Path]) -> str: raise TypeError(f"The type of {content} does not support.") return kind.extension + + +def write_txt(save_path: str, content: list, mode="w"): + """ + 将list内容写入txt中 + @param + content: list格式内容 + save_path: 绝对路径str + @return:None + """ + with open(save_path, mode, encoding="utf-8") as f: + for value in content: + if isinstance(value, str): + f.write(value + "\n") + elif isinstance(value, list): + for one_v in value: + f.write(f"{one_v[0]}\n") + else: + continue + + +def remove_invalid(content_list, invalid_list): + return [v for i, v in enumerate(content_list) if i not in invalid_list] + + +def is_contain_continous_str(content: str, pattern: str) -> bool: + """是否存在匹配满足pattern的连续字符""" + match_result = re.findall(pattern, content) + if match_result: + return True + return False + + +def draw_text_det_res(dt_boxes, raw_im): + src_im = copy.deepcopy(raw_im) + for i, box in enumerate(dt_boxes): + box = np.array(box).astype(np.int32).reshape(-1, 2) + cv2.polylines(src_im, [box], True, color=(0, 0, 255), thickness=1) + cv2.putText( + src_im, + str(i), + (int(box[0][0]), int(box[0][1])), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 0, 0), + 2, + ) + return src_im + + +def mkdir(dir_path): + Path(dir_path).mkdir(parents=True, exist_ok=True) + + +def get_between_day(begin_date, end_date): + date_list = [] + begin_date = datetime.strptime(begin_date, "%Y-%m-%d") + end_date = datetime.strptime(end_date, "%Y-%m-%d") + while begin_date <= end_date: + date_str = begin_date.strftime("%Y-%m-%d") + date_list.append(date_str) + begin_date += timedelta(days=1) + return date_list + + +def get_seconds(str_date): + date_time = datetime.strptime(str_date, "%Y-%m-%d") + timedelta_between = date_time - datetime(1900, 1, 1) + return timedelta_between.total_seconds() + + +def import_module(module_dict): + imported_module = importlib.import_module(module_dict["module_dir"]) + module_class = getattr(imported_module, module_dict["module_name"]) + return module_class + + +def get_cur_day(): + cur_day = time.strftime("%Y-%m-%d", time.localtime(time.time())) + return cur_day + + +def only_contain_str(src_text, given_str_list=None): + """是否只包含given_str_list中字符 + + :param src_text (str): 给定文本 + :param given_str_list (list): , defaults to None + :return: bool + """ + for value in src_text: + if value not in given_str_list: + return False + return True + + +def is_contain_str( + src_text: Union[str, List], + given_str_list: Union[str, List], +) -> bool: + """src_text中是否包含given_str_list中任意一个字符 + + Args: + src_text (str or list): + given_str_list (str or list): + + Returns: + bool: + """ + return any(i in src_text for i in given_str_list) diff --git a/test_pdf_extract.py b/test_pdf_extract.py new file mode 100644 index 0000000..549455c --- /dev/null +++ b/test_pdf_extract.py @@ -0,0 +1,13 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +from pathlib import Path + +from rapid_doc.pdf_extract.main import PDFExtract + +pdf_path = Path("tests/test_files/direct_extract/single_column.pdf") +extract = PDFExtract(pdf_path) + +pdf_img_list = extract.read_pdf() +pdf_nums = extract.get_page_count() +print("ok") diff --git a/tests/test_main.py b/tests/test_main.py index b26e3dd..c344414 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -9,9 +9,9 @@ sys.path.append(str(root_dir)) -from rapid_doc import RapidLayoutRecover +from rapid_doc import RapidDoc -layout_recover = RapidLayoutRecover() +layout_recover = RapidDoc() test_file_dir = cur_dir / "test_files" From 8a94199436908e22ba512b6ea9fef88bd9a6064f Mon Sep 17 00:00:00 2001 From: SWHL Date: Fri, 1 Nov 2024 08:25:51 +0800 Subject: [PATCH 8/9] docs: update READEME --- README.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9f2df8c..c41d026 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,25 @@ -# 🚀 Work In Progress +
+
+

📃 Rapid Doc

+
+ +
+ + + + + +SemVer2.0 + -整体功能还没开发完哈!欢迎加入一起搞 +
+ +> +> ## 🚀 Work In Progress +> +> 整体功能还没开发完哈!欢迎加入一起搞 -## 🔍 Rapid Doc +## 📝 简介 该项目主要针对文档类图像做内容提取,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。后续计划支持输入PDF/图像,输出对应json格式、Txt格式、Word格式和Markdown格式。 From 5e5fef5e7d02bb646111e063e041566318596c50 Mon Sep 17 00:00:00 2001 From: SWHL Date: Fri, 1 Nov 2024 08:27:38 +0800 Subject: [PATCH 9/9] docs: add table structure rec --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c41d026..cee729c 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ - [rapid_orientation](https://github.com/RapidAI/RapidStructure/blob/main/docs/README_Orientation.md) - [rapid_layout](https://github.com/RapidAI/RapidLayout) -- [rapid_table](https://github.com/RapidAI/RapidTable) +- [rapid_table](https://github.com/RapidAI/RapidTable) / [TableStructureRec](https://github.com/RapidAI/TableStructureRec) - [rapid_latex_ocr](https://github.com/RapidAI/RapidLatexOCR) - [rapidocr_onnxruntime](https://github.com/RapidAI/RapidOCR) - [rapidocr_layout_recover](https://github.com/RapidAI/RapidDoc)