-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtest.applevisionocr.py
233 lines (199 loc) · 8.26 KB
/
test.applevisionocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/env python3
# applevisionocr.py
import os
import ctypes
import logging
import argparse
from pathlib import Path
from Foundation import NSArray, NSString, NSAutoreleasePool
from AppKit import NSBitmapImageRep, NSImage
from Vision import VNImageRequestHandler, VNRecognizeTextRequest
import Quartz
from pdf2image import convert_from_path
import fitz # PyMuPDF
from PIL import Image
import io
# 定义常量
kCGRenderingIntentDefault = 0
kCGRenderingIntentAbsoluteColorimetric = 1
kCGRenderingIntentRelativeColorimetric = 2
kCGRenderingIntentPerceptual = 3
kCGRenderingIntentSaturation = 4
# 定义 ctypes 接口
lib = ctypes.CDLL('/System/Library/Frameworks/Quartz.framework/Quartz')
lib.CGDataProviderCreateWithFilename.argtypes = [ctypes.c_char_p]
lib.CGDataProviderCreateWithFilename.restype = ctypes.c_void_p
lib.CGImageCreateWithPNGDataProvider.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_bool, ctypes.c_int]
lib.CGImageCreateWithPNGDataProvider.restype = ctypes.c_void_p
lib.CGImageRelease.argtypes = [ctypes.c_void_p]
lib.CGImageRelease.restype = None
lib.CGDataProviderRelease.argtypes = [ctypes.c_void_p]
lib.CGDataProviderRelease.restype = None
# 定义固定的临时目录
TEMP_DIR = Path.home() / "Documents" / "ocr_temp"
def ensure_temp_dir():
"""
确保临时目录存在,如果不存在则创建一个新的。
"""
try:
TEMP_DIR.mkdir(parents=True, exist_ok=True)
if not os.access(TEMP_DIR, os.W_OK):
raise PermissionError(f"没有写入权限:{TEMP_DIR}")
logging.debug(f"使用临时目录:{TEMP_DIR}")
except Exception as e:
logging.error(f"无法创建或访问临时目录 {TEMP_DIR}: {e}")
raise
def preprocess_image(image_path):
"""
对图像进行预处理,例如转换为灰度图和二值化。
"""
try:
with Image.open(image_path) as img:
img = img.convert('L') # 转换为灰度图
img = img.point(lambda x: 0 if x < 128 else 255, '1') # 二值化
return img
except Exception as e:
logging.error(f"预处理图像 {image_path} 时出错:{e}")
raise
def ocr_detect_image(cgimage) -> str:
"""
使用 Vision 框架对 CGImage 进行 OCR 识别。
"""
try:
handler = VNImageRequestHandler.alloc().initWithCGImage_options_(cgimage, None)
request = VNRecognizeTextRequest.alloc().init()
request.setRecognitionLanguages_(["zh-Hans", "zh-Hant"]) # 简体中文和繁体中文
request.setUsesLanguageCorrection_(True)
success = handler.performRequests_error_([request], None)
if not success:
raise Exception("文字识别失败")
results = request.results()
if results:
recognized_text = "\n".join([result.topCandidates_(1)[0].string() for result in results])
return recognized_text
else:
return ""
except Exception as e:
logging.error(f"OCR 识别出错: {e}")
return ""
def ocr_detect_png(path: Path) -> str:
"""
对 PNG 图像进行 OCR 识别。
"""
try:
# 预处理图像
preprocessed_img = preprocess_image(path)
# 将预处理后的图像保存为临时文件
temp_path = TEMP_DIR / f"preprocessed_{path.name}"
preprocessed_img.save(temp_path)
# 使用预处理后的图像创建 NSImage
ns_image = NSImage.alloc().initWithContentsOfFile_(str(temp_path))
if ns_image is None:
raise Exception("无法加载图像")
bitmap_rep = NSBitmapImageRep.imageRepWithData_(ns_image.TIFFRepresentation())
if bitmap_rep is None:
raise Exception("无法创建位图表示")
cg_image = bitmap_rep.CGImage()
if cg_image is None:
raise Exception("无法创建 CGImage")
text = ocr_detect_image(cg_image)
# 删除预处理后的临时文件
temp_path.unlink()
return text
except Exception as e:
logging.error(f"处理图片 {path} 时出错:{e}")
return ""
def convert_pdf_to_png(pdf_path: Path, dpi: int = 300) -> list:
"""
使用 pdf2image 或 PyMuPDF 将 PDF 页面转换为 PNG 图像。
"""
image_paths = []
try:
# 首先尝试使用 pdf2image
logging.debug("尝试使用 pdf2image 转换 PDF 为 PNG")
images = convert_from_path(pdf_path, dpi=dpi, output_folder=str(TEMP_DIR))
for i, img in enumerate(images):
png_path = TEMP_DIR / f"page_{i+1}.png"
img.save(png_path, "PNG")
image_paths.append(png_path)
logging.debug(f"保存了页面 {i+1} 到 {png_path}")
except Exception as e:
logging.error(f"使用 pdf2image 转换 PDF 时出错:{e}")
logging.info("尝试使用 PyMuPDF 转换 PDF 为 PNG")
try:
doc = fitz.open(pdf_path)
for page_num in range(len(doc)):
page = doc.load_page(page_num)
pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72))
image_path = TEMP_DIR / f"page_{page_num+1}.png"
pix.save(image_path)
image_paths.append(image_path)
logging.debug(f"保存了页面 {page_num+1} 到 {image_path}")
doc.close()
except Exception as e:
logging.error(f"使用 PyMuPDF 转换 PDF 时出错:{e}")
return image_paths
def ocr_detect_pdf(pdf_path: Path, save_txt_path: Path = None, dpi: int = 300) -> str:
"""
对 PDF 文件进行 OCR 识别,并将结果保存为 TXT 文件。
"""
logging.info(f"开始 OCR 识别 PDF 文件:{pdf_path}")
image_paths = convert_pdf_to_png(pdf_path, dpi)
if not image_paths:
logging.error("无法提取 PDF 页面")
return ""
all_text = ""
for i, image_path in enumerate(image_paths):
logging.info(f"正在处理第 {i+1} 页,共 {len(image_paths)} 页")
if not image_path.exists():
logging.error(f"图片文件不存在:{image_path}")
continue
try:
text = ocr_detect_png(image_path)
all_text += text + "\n"
except Exception as e:
logging.error(f"处理图片 {image_path} 时出错:{e}")
if save_txt_path:
try:
save_txt_path.parent.mkdir(parents=True, exist_ok=True)
with open(save_txt_path, 'w', encoding='utf-8') as f:
f.write(all_text)
logging.info(f"OCR 识别完成,结果已保存到 {save_txt_path}")
except Exception as e:
logging.error(f"保存 OCR 结果到 {save_txt_path} 时出错:{e}")
return all_text
def main():
"""
脚本的主函数,解析参数并启动 OCR 过程。
"""
parser = argparse.ArgumentParser(description="使用 OCR 识别 PDF 文件")
parser.add_argument('-i', '--input', type=str, required=True, help="输入的 PDF 文件路径")
parser.add_argument('-o', '--output', type=str, required=True, help="输出的 TXT 文件路径")
parser.add_argument('-d', '--dpi', type=int, default=300, help="转换图片的分辨率,默认 300 DPI")
parser.add_argument('-v', '--verbose', action='store_true', help="启用详细日志")
args = parser.parse_args()
# 展开用户路径
input_path = Path(os.path.expanduser(args.input))
output_path = Path(os.path.expanduser(args.output))
# 配置日志
if args.verbose:
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')
else:
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# 确保临时目录存在
try:
ensure_temp_dir()
except Exception as e:
logging.error(f"无法准备临时目录:{e}")
return
# 开始 OCR 过程
try:
recognized_text = ocr_detect_pdf(input_path, output_path, args.dpi)
if recognized_text:
logging.info(f"OCR 识别完成,结果已保存到 {output_path}")
else:
logging.info("OCR 识别完成,但未检测到任何文本。")
except Exception as e:
logging.error(f"OCR 识别失败: {e}")
if __name__ == "__main__":
main()