diff --git a/.gitignore b/.gitignore
index 6d3932f..69525af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
*.vscode
+outputs/
*.pyc
@@ -168,4 +169,4 @@ long1.jpg
*.pdiparams.info
*.pdmodel
-.DS_Store
\ No newline at end of file
+.DS_Store
diff --git a/README.md b/README.md
index 5c01438..cee729c 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,38 @@
-# Work In Progress
-整体功能还没开发完哈!欢迎加入一起搞
+
+
+
📃 Rapid Doc
+
-## Rapid Layout Recover
+

+

+

+

+

+

+

+

-该项目主要针对文档类图像做版面还原,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。
+
-## 输入和输出
+>
+> ## 🚀 Work In Progress
+>
+> 整体功能还没开发完哈!欢迎加入一起搞
-- 输入:文档类图像
-- 输出:TXT或Word
+## 📝 简介
+
+该项目主要针对文档类图像做内容提取,将文档类图像一比一输出到Word或者Txt中,便于进一步使用或处理。后续计划支持输入PDF/图像,输出对应json格式、Txt格式、Word格式和Markdown格式。
+
+## 🛠️ 整体框架
+
+以下为整体框架依赖包,均为RapidAI出品。
-## 整体框架
+- [rapid_orientation](https://github.com/RapidAI/RapidStructure/blob/main/docs/README_Orientation.md)
+- [rapid_layout](https://github.com/RapidAI/RapidLayout)
+- [rapid_table](https://github.com/RapidAI/RapidTable) / [TableStructureRec](https://github.com/RapidAI/TableStructureRec)
+- [rapid_latex_ocr](https://github.com/RapidAI/RapidLatexOCR)
+- [rapidocr_onnxruntime](https://github.com/RapidAI/RapidOCR)
+- [rapidocr_layout_recover](https://github.com/RapidAI/RapidDoc)
```mermaid
flowchart TD
@@ -19,12 +41,40 @@ flowchart TD
G --> H[/结构化输出/]
```
-## Star History
+## 📑 输入和输出
+
+- 输入:文档类图像
+- 输出:TXT或Word
+
+## 💻 安装运行环境
+
+```bash
+pip install -r requirements.txt
+```
+
+## 🚀 运行Demo
+
+```bash
+git clone https://github.com/RapidAI/RapidDoc.git
+cd RapidDoc
+python demo.py
+```
+
+## 📈 结果示例
+
+⚠️注意:之所以提取结果没有分段,是因为版面分析模型没有段落检测功能。现有开源的所有版面分析模型都没有段落检测功能,这个后续会考虑自己训练一个版面分析模型来优化这里。
+
+
+

+
+
+
+## ⭐ Star History
-
+
-
-
-
+
+
+
diff --git a/demo.py b/demo.py
index 715fff9..d9aeebc 100644
--- a/demo.py
+++ b/demo.py
@@ -1,15 +1,28 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
-from rapid_layout_recover import RapidLayoutRecover
+from pathlib import Path
-pdf_parser = RapidLayoutRecover()
+from rapid_doc import RapidDoc
-pdf_path = "tests/test_files/direct_extract/two_column.pdf"
+pdf_parser = RapidDoc()
+
+pdf_path = "tests/test_files/scan_pdf/B0702罗马十二帝王传Page3_5.pdf"
+# pdf_path = "tests/test_files/direct_extract/single_column.pdf"
result = pdf_parser(pdf_path)
+content = []
for v in result:
txts = v[2]
for vv in txts:
print(vv[0] + "\n")
+ content.append(vv[0])
+
+save_dir = Path("outputs")
+save_dir.mkdir(parents=True, exist_ok=True)
+save_txt_path = save_dir / "1.txt"
+with open(save_txt_path, "w", encoding="utf-8") as f:
+ for v in content:
+ f.write(f"{v}\n")
+print("ok")
diff --git a/rapid_doc/__init__.py b/rapid_doc/__init__.py
new file mode 100644
index 0000000..f7852e0
--- /dev/null
+++ b/rapid_doc/__init__.py
@@ -0,0 +1,6 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+from .main import RapidDoc, RapidDocError
+
+__all__ = ["RapidDoc", "RapidDocError"]
diff --git a/rapid_layout_recover/layout_recover/__init__.py b/rapid_doc/layout_recover/__init__.py
similarity index 100%
rename from rapid_layout_recover/layout_recover/__init__.py
rename to rapid_doc/layout_recover/__init__.py
diff --git a/rapid_layout_recover/layout_recover/main.py b/rapid_doc/layout_recover/main.py
similarity index 99%
rename from rapid_layout_recover/layout_recover/main.py
rename to rapid_doc/layout_recover/main.py
index 3ab9b78..87440bc 100644
--- a/rapid_layout_recover/layout_recover/main.py
+++ b/rapid_doc/layout_recover/main.py
@@ -3,7 +3,7 @@
# @Contact: liekkaskono@163.com
import copy
import string
-from typing import List
+from typing import List, Optional
import numpy as np
import shapely
@@ -22,9 +22,10 @@ def __call__(
layout_cls_names: List[str],
ocr_boxes: np.ndarray,
ocr_rec_res: List[str],
- ratio,
+ ratio: Optional[float] = None,
):
- self.ratio = ratio
+ if ratio is not None:
+ self.ratio = ratio
# 版面分析和段落合并操作
(
diff --git a/rapid_layout_recover/main.py b/rapid_doc/main.py
similarity index 74%
rename from rapid_layout_recover/main.py
rename to rapid_doc/main.py
index f50cdda..e24b489 100644
--- a/rapid_layout_recover/main.py
+++ b/rapid_doc/main.py
@@ -2,7 +2,7 @@
# @Author: SWHL
# @Contact: liekkaskono@163.com
from pathlib import Path
-from typing import List, Union
+from typing import List, Tuple, Union
import cv2
import fitz
@@ -10,16 +10,18 @@
from rapid_layout import RapidLayout
from tqdm import tqdm
-from .direct_extract import PDFExtract
from .layout_recover import LayoutRecover
+from .ocr_extract import OCRExtract
+from .pdf_extract import PDFExtract
from .utils import which_type
-class RapidLayoutRecover:
+class RapidDoc:
def __init__(self, dpi: int = 96):
self.dpi = dpi
self.layout = RapidLayout()
self.pdf_extracter = PDFExtract()
+ self.ocr_extracter = OCRExtract()
self.layout_recover = LayoutRecover()
def __call__(self, pdf_path: Union[str, Path]):
@@ -29,10 +31,10 @@ def __call__(self, pdf_path: Union[str, Path]):
try:
file_type = which_type(pdf_path)
except (FileExistsError, TypeError) as exc:
- raise RapidLayoutRecoverError("The input content is empty.") from exc
+ raise RapidDocError("The input content is empty.") from exc
if file_type != "pdf":
- raise RapidLayoutRecoverError("The file type is not PDF format.")
+ raise RapidDocError("The file type is not PDF format.")
self.pdf_extracter.extract_all_pages(pdf_path)
@@ -42,10 +44,9 @@ def __call__(self, pdf_path: Union[str, Path]):
img = self.convert_img(page)
# 版面分析 ([x, 4], ['text', 'text', 'text', 'header'])
- layout_bboxes, layout_cls_names, _ = self.layout(img)
- layout_bboxes = layout_bboxes.cpu().numpy()
+ layout_bboxes, _, layout_cls_names, _ = self.layout(img)
- # # 可视化当前页
+ # 可视化当前页
# import copy
# tmp_img = copy.deepcopy(img)
@@ -72,12 +73,11 @@ def __call__(self, pdf_path: Union[str, Path]):
img_width = img.shape[1]
txt_boxes, txts = self.run_direct_extract(i, img_width)
else:
- # TODO
- txt_boxes, txts = self.run_ocr_extract(page)
+ txt_boxes, txts = self.run_ocr_extract(img)
# 逐页合并版面分析和文本结果
img_h, img_w = img.shape[:2]
- final_bboxes, final_txts = self.merge_layout_txts(
+ final_bboxes, final_txts = self.layout_recover(
img_h,
img_w,
layout_bboxes,
@@ -99,28 +99,16 @@ def convert_img(self, page):
def is_extract(self, page) -> bool:
return len(page.get_text()) > 100
- def run_direct_extract(self, page_num: int, img_width: int):
+ def run_direct_extract(
+ self, page_num: int, img_width: int
+ ) -> Tuple[np.ndarray, List[Tuple[str, float]]]:
txt_boxes, txts = self.pdf_extracter.extract_page_text(page_num, img_width)
return txt_boxes, txts
- def run_ocr_extract(self, page):
- return None
-
- def merge_layout_txts(
- self,
- img_h: int,
- img_w: int,
- layout_bboxes: np.ndarray,
- layout_cls_names: List[str],
- txt_boxes: np.ndarray,
- txts: List[str],
- ratio,
- ):
- txt_boxes, txts = self.layout_recover(
- img_h, img_w, layout_bboxes, layout_cls_names, txt_boxes, txts, ratio
- )
+ def run_ocr_extract(self, img: np.ndarray):
+ txt_boxes, txts = self.ocr_extracter(img)
return txt_boxes, txts
-class RapidLayoutRecoverError(Exception):
+class RapidDocError(Exception):
pass
diff --git a/rapid_layout_recover/ocr_extract/__init__.py b/rapid_doc/ocr_extract/__init__.py
similarity index 71%
rename from rapid_layout_recover/ocr_extract/__init__.py
rename to rapid_doc/ocr_extract/__init__.py
index 0ecdd4f..3db482f 100644
--- a/rapid_layout_recover/ocr_extract/__init__.py
+++ b/rapid_doc/ocr_extract/__init__.py
@@ -1,3 +1,4 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
+from .main import OCRExtract
diff --git a/rapid_doc/ocr_extract/main.py b/rapid_doc/ocr_extract/main.py
new file mode 100644
index 0000000..d62edb0
--- /dev/null
+++ b/rapid_doc/ocr_extract/main.py
@@ -0,0 +1,24 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+from typing import List, Optional, Tuple
+
+import numpy as np
+from rapidocr_onnxruntime import RapidOCR
+
+
+class OCRExtract:
+    """Run RapidOCR on an image and repackage its output as boxes plus texts."""
+
+    def __init__(self):
+        self.ocr = RapidOCR()
+
+    def __call__(
+        self, img: np.ndarray
+    ) -> Optional[Tuple[np.ndarray, List[Tuple[str, float]]]]:
+        """Recognise the text in ``img``.
+
+        Args:
+            img: Image handed straight to the wrapped ``RapidOCR`` instance.
+
+        Returns:
+            ``None`` when the OCR engine yields a falsy result. Otherwise a
+            pair ``(boxes, txts)``: ``boxes`` is an array built from the first
+            field of every OCR entry, and ``txts`` is a list of
+            ``(text, score)`` tuples built from the second and third fields.
+            Callers that unpack the return value must handle ``None`` first.
+        """
+        ocr_res, _ = self.ocr(img)
+        if not ocr_res:
+            return None
+
+        # Transpose the per-entry (box, text, score) triples into three columns.
+        det_boxes, rec_txts, rec_scores = zip(*ocr_res)
+        return np.array(det_boxes), list(zip(rec_txts, rec_scores))
diff --git a/rapid_layout_recover/direct_extract/__init__.py b/rapid_doc/pdf_extract/__init__.py
similarity index 67%
rename from rapid_layout_recover/direct_extract/__init__.py
rename to rapid_doc/pdf_extract/__init__.py
index 78158f7..c7c7836 100644
--- a/rapid_layout_recover/direct_extract/__init__.py
+++ b/rapid_doc/pdf_extract/__init__.py
@@ -1,4 +1,4 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
-from .pdf_extract import PDFExtract
+from .main import PDFExtract
diff --git a/rapid_layout_recover/direct_extract/pdf_extract.py b/rapid_doc/pdf_extract/main.py
similarity index 88%
rename from rapid_layout_recover/direct_extract/pdf_extract.py
rename to rapid_doc/pdf_extract/main.py
index df7bcf4..b56818a 100644
--- a/rapid_layout_recover/direct_extract/pdf_extract.py
+++ b/rapid_doc/pdf_extract/main.py
@@ -1,10 +1,12 @@
# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
import copy
-import re
import string
from collections import Counter
from typing import List, Optional
+import camelot
import cv2
import fitz
import numpy as np
@@ -13,19 +15,19 @@
from pdfminer.layout import LTPage, LTTextBoxHorizontal, LTTextLineHorizontal
from shapely.geometry import MultiPoint, Polygon
+from ..utils import is_contain_continous_str, only_contain_str
+
class PDFExtract:
def __init__(self):
- self.ratio = None
-
- self.texts = []
- self.table_content = []
+ self.pdf_path = None
self.pages = None
+ self.ratio = None
- def extract_all_pages(self, pdf_path):
+ def extract_all_pages(self, pdf_path: str):
self.pages = list(extract_pages(pdf_path))
- def read_pdf(self) -> List:
+ def read_pdf(self, pdf_path) -> List:
def convert_img(page):
pix = page.get_pixmap(dpi=200)
img = np.frombuffer(pix.samples, dtype=np.uint8)
@@ -33,12 +35,12 @@ def convert_img(page):
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
return img
- with fitz.open(stream=self.pdf_path) as pdfer:
+ with fitz.open(pdf_path) as pdfer:
pdf_img_list = list(map(convert_img, pdfer))
return pdf_img_list
- def get_page_count(self):
- with fitz.open(stream=self.pdf_path) as pdfer:
+ def get_page_count(self, pdf_path):
+ with fitz.open(pdf_path) as pdfer:
return pdfer.page_count
def merge_ocr_direct(self, img, page_num, dt_boxes, rec_res):
@@ -87,8 +89,7 @@ def extract_page_text(self, page_num, ori_img_width):
if not isinstance(page, LTPage):
return np.array([])
- page_height = page.height
- texts, boxes = [], []
+ boxes, self.texts = [], []
for text_box_h in page:
if not isinstance(text_box_h, LTTextBoxHorizontal):
continue
@@ -97,24 +98,16 @@ def extract_page_text(self, page_num, ori_img_width):
if not isinstance(text_box_h_l, LTTextLineHorizontal):
continue
- # 注意这里bbox的返回值是left,bottom,right,top
- left, bottom, right, top = text_box_h_l.bbox
-
- # 注意 bottom和top是距离页面底部的坐标值,
- # 需要用当前页面高度减当前坐标值,才是以左上角为原点的坐标
- bottom = page_height - bottom
- top = page_height - top
- text = text_box_h_l.get_text()
-
- x0, y0 = left, top
- x1, y1 = right, bottom
+ x0, y0, x1, y1 = text_box_h_l.bbox
+ y0 = page.height - y0
+ y1 = page.height - y1
text = text_box_h_l.get_text()
boxes.append([[x0, y0], [x1, y0], [x1, y1], [x0, y1]])
- texts.append((text.strip(), 1.0))
+ self.texts.append(text)
self.ratio = ori_img_width / page.width
- return np.array(boxes), texts
+ return np.array(boxes)
def get_matched_boxes_rec(self, dt_boxes, direct_boxes, rec_res):
invalid_symbol_pattern = r'[$#&‘’”“(){}\[\]>?%,-./*!="+:&@]{3,}'
@@ -160,8 +153,7 @@ def process_en_text(text):
):
# SatELLItE
break
-
- if (
+ elif (
first_ele.islower()
and last_ele.islower()
and not only_contain_str(middle_eles, string.ascii_lowercase)
@@ -257,6 +249,7 @@ def extract_tables(self, page_num):
line_scale=40,
)
table_bbox = []
+ self.table_content = []
for one_table in tables:
pdf_height = one_table._image[0].shape[0] / (300 / 72)
x0, y0, x1, y1 = one_table._bbox
@@ -332,24 +325,3 @@ def _compute_poly_iou(poly1, poly2):
except shapely.geos.TopologicalError:
print("shapely.geos.TopologicalError occured, iou set to 0")
return iou
-
-
-def is_contain_continous_str(content: str, pattern: str) -> bool:
- """是否存在匹配满足pattern的连续字符"""
- match_result = re.findall(pattern, content)
- if match_result:
- return True
- return False
-
-
-def only_contain_str(src_text, given_str_list=None):
- """是否只包含given_str_list中字符
-
- :param src_text (str): 给定文本
- :param given_str_list (list): , defaults to None
- :return: bool
- """
- for value in src_text:
- if value not in given_str_list:
- return False
- return True
diff --git a/rapid_doc/utils.py b/rapid_doc/utils.py
new file mode 100644
index 0000000..50cf138
--- /dev/null
+++ b/rapid_doc/utils.py
@@ -0,0 +1,134 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+import copy
+import importlib
+import re
+import time
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import List, Union
+
+import cv2
+import filetype
+import numpy as np
+
+
+def which_type(content: Union[bytes, str, Path]) -> str:
+    """Guess the file type of ``content`` and return its extension.
+
+    Args:
+        content: Raw bytes, or a ``str``/``Path`` that names a file.
+
+    Returns:
+        The ``extension`` attribute of the match from ``filetype.guess``.
+
+    Raises:
+        FileExistsError: ``content`` is a ``str``/``Path`` that does not
+            exist on disk.
+        TypeError: ``filetype.guess`` returned ``None``.
+    """
+    if isinstance(content, (str, Path)):
+        if not Path(content).exists():
+            raise FileExistsError(f"{content} does not exist.")
+
+    guessed = filetype.guess(content)
+    if guessed is None:
+        raise TypeError(f"The type of {content} does not support.")
+    return guessed.extension
+
+
+def write_txt(save_path: str, content: list, mode="w"):
+    """Write the items of ``content`` to a UTF-8 text file, one per line.
+
+    String items are written as they are. For a list item, the first element
+    of every entry in that list is written. Any other item type is skipped.
+
+    Args:
+        save_path: Path of the file to open.
+        content: Items to write.
+        mode: Mode passed to ``open``; defaults to ``"w"``.
+
+    Returns:
+        None
+    """
+    with open(save_path, mode, encoding="utf-8") as writer:
+        for item in content:
+            if isinstance(item, str):
+                writer.write(item + "\n")
+                continue
+
+            if not isinstance(item, list):
+                continue
+
+            for entry in item:
+                writer.write(f"{entry[0]}\n")
+
+
+def remove_invalid(content_list, invalid_list):
+    """Return the items of ``content_list`` whose index is not in ``invalid_list``."""
+    kept = []
+    for idx, item in enumerate(content_list):
+        if idx in invalid_list:
+            continue
+        kept.append(item)
+    return kept
+
+
+def is_contain_continous_str(content: str, pattern: str) -> bool:
+    """Return True when ``pattern`` matches anywhere in ``content``."""
+    return bool(re.findall(pattern, content))
+
+
+def draw_text_det_res(dt_boxes, raw_im):
+    """Draw every box in ``dt_boxes``, labelled with its index, on a copy of ``raw_im``.
+
+    Args:
+        dt_boxes: Iterable of boxes; each one is reshaped to ``(-1, 2)`` points.
+        raw_im: Image to annotate. It is deep-copied, so the input is untouched.
+
+    Returns:
+        The annotated copy of ``raw_im``.
+    """
+    canvas = copy.deepcopy(raw_im)
+    for idx, raw_box in enumerate(dt_boxes):
+        pts = np.array(raw_box).astype(np.int32).reshape(-1, 2)
+        cv2.polylines(canvas, [pts], True, color=(0, 0, 255), thickness=1)
+
+        # Anchor the index label at the first point of the box.
+        anchor = (int(pts[0][0]), int(pts[0][1]))
+        cv2.putText(
+            canvas,
+            str(idx),
+            anchor,
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.5,
+            (255, 0, 0),
+            2,
+        )
+    return canvas
+
+
+def mkdir(dir_path):
+    """Create ``dir_path`` with any missing parents; an existing directory is fine."""
+    target = Path(dir_path)
+    target.mkdir(parents=True, exist_ok=True)
+
+
+def get_between_day(begin_date, end_date):
+    """List every day from ``begin_date`` to ``end_date``, both inclusive.
+
+    Args:
+        begin_date: First day, formatted as ``%Y-%m-%d``.
+        end_date: Last day, formatted as ``%Y-%m-%d``.
+
+    Returns:
+        ``%Y-%m-%d`` strings in ascending order; empty when ``end_date`` is
+        earlier than ``begin_date``.
+    """
+    first_day = datetime.strptime(begin_date, "%Y-%m-%d")
+    last_day = datetime.strptime(end_date, "%Y-%m-%d")
+    span = (last_day - first_day).days
+    return [
+        (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
+        for offset in range(span + 1)
+    ]
+
+
+def get_seconds(str_date):
+    """Return the seconds between 1900-01-01 and ``str_date`` (``%Y-%m-%d``)."""
+    epoch = datetime(1900, 1, 1)
+    return (datetime.strptime(str_date, "%Y-%m-%d") - epoch).total_seconds()
+
+
+def import_module(module_dict):
+    """Import a module and return one attribute from it.
+
+    Args:
+        module_dict: Mapping with ``"module_dir"`` (the name given to
+            ``importlib.import_module``) and ``"module_name"`` (the attribute
+            to fetch from the imported module).
+
+    Returns:
+        The attribute looked up on the imported module.
+    """
+    loaded = importlib.import_module(module_dict["module_dir"])
+    return getattr(loaded, module_dict["module_name"])
+
+
+def get_cur_day():
+    """Return today's local date formatted as ``%Y-%m-%d``."""
+    now = time.localtime(time.time())
+    return time.strftime("%Y-%m-%d", now)
+
+
+def only_contain_str(src_text, given_str_list=None):
+    """Check whether ``src_text`` consists solely of allowed characters.
+
+    Args:
+        src_text (str): Text to inspect.
+        given_str_list (str or list, optional): Allowed characters. ``None``
+            (the default) is treated as an empty set of allowed characters.
+
+    Returns:
+        bool: True when every character of ``src_text`` is in
+        ``given_str_list``; an empty ``src_text`` always yields True.
+    """
+    # ``value in None`` raises TypeError, so map the default to "nothing allowed".
+    allowed = "" if given_str_list is None else given_str_list
+    return all(char in allowed for char in src_text)
+
+
+def is_contain_str(
+    src_text: Union[str, List],
+    given_str_list: Union[str, List],
+) -> bool:
+    """Check whether ``src_text`` contains any element of ``given_str_list``.
+
+    Args:
+        src_text (str or list): Container that is searched.
+        given_str_list (str or list): Candidates to look for.
+
+    Returns:
+        bool: True as soon as one candidate is found in ``src_text``,
+        otherwise False.
+    """
+    for candidate in given_str_list:
+        if candidate in src_text:
+            return True
+    return False
diff --git a/rapid_layout_recover/__init__.py b/rapid_layout_recover/__init__.py
deleted file mode 100644
index 0aad210..0000000
--- a/rapid_layout_recover/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# -*- encoding: utf-8 -*-
-# @Author: SWHL
-# @Contact: liekkaskono@163.com
-from .main import RapidLayoutRecover, RapidLayoutRecoverError
-
-__all__ = ["RapidLayoutRecover", "RapidLayoutRecoverError"]
diff --git a/rapid_layout_recover/utils.py b/rapid_layout_recover/utils.py
deleted file mode 100644
index aec642d..0000000
--- a/rapid_layout_recover/utils.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# -*- encoding: utf-8 -*-
-# @Author: SWHL
-# @Contact: liekkaskono@163.com
-from pathlib import Path
-from typing import Union
-
-import filetype
-
-
-def which_type(content: Union[bytes, str, Path]) -> str:
- if isinstance(content, (str, Path)) and not Path(content).exists():
- raise FileExistsError(f"{content} does not exist.")
-
- kind = filetype.guess(content)
- if kind is None:
- raise TypeError(f"The type of {content} does not support.")
-
- return kind.extension
diff --git a/test_pdf_extract.py b/test_pdf_extract.py
new file mode 100644
index 0000000..549455c
--- /dev/null
+++ b/test_pdf_extract.py
@@ -0,0 +1,13 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+from pathlib import Path
+
+from rapid_doc.pdf_extract.main import PDFExtract
+
+# Smoke check for PDFExtract: convert each page to an image and read the page count.
+pdf_path = Path("tests/test_files/direct_extract/single_column.pdf")
+# PDFExtract.__init__ takes no arguments; the PDF path is passed to each call.
+extract = PDFExtract()
+
+pdf_img_list = extract.read_pdf(pdf_path)
+pdf_nums = extract.get_page_count(pdf_path)
+print("ok")
diff --git a/tests/test_main.py b/tests/test_main.py
index 49600c8..c344414 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -9,9 +9,9 @@
sys.path.append(str(root_dir))
-from rapid_layout_recover import RapidLayoutRecover
+from rapid_doc import RapidDoc
-layout_recover = RapidLayoutRecover()
+layout_recover = RapidDoc()
test_file_dir = cur_dir / "test_files"
@@ -20,7 +20,5 @@ def test_direct_single_column():
pdf_path = test_file_dir / "direct_extract" / "single_column.pdf"
result = layout_recover(pdf_path)
-
assert len(result) == 1
- assert len(result[0][2]) == 9
assert result[0][2][0][0][:5] == "星期天早晨"