GOT-OCR2_0部署使用
在 Ubuntu 下部署并使用 GOT-OCR2_0,用于提取图片、PDF 等文件中的文字内容并保存为 txt 文件
·
0.介绍
GOT-OCR2_0 是一个 OCR 模型,本文用它来提取文件中的文字内容,并将结果保存为 txt 文件
1.创建虚拟环境
conda create -n gotocr python=3.10
2.进入虚拟环境
conda activate gotocr
3.安装需要的包
pip install torch
pip install torchvision
pip install transformers
pip install tiktoken
pip install verovio
pip install accelerate
4.进行模型下载
python
# Set the HF_ENDPOINT environment variable before importing transformers.
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from transformers import AutoModel, AutoTokenizer

# Repository id shared by the tokenizer and the model download.
repo_id = 'ucaslcl/GOT-OCR2_0'
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo_id,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
5.模型保存本地
# Save the tokenizer first, then the model, into the same local directory.
local_dir = "./GOT-OCR2_0"
tokenizer.save_pretrained(local_dir)
model.save_pretrained(local_dir)
6.修改代码
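出现问题:运行后面第7步的测试代码时,可能会在调用 `past_key_values.get_max_length()` 处报错(该方法在较新版本的 transformers 中可能已不可用)
解决问题:找到下面路径中的 modeling_GOT.py 文件(路径中的用户名和 `...` 部分以自己机器上的实际路径为准)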
/home/aaa/.cache/huggingface/modules/transformers_modules/ucaslcl/GOT-OCR2_0/.../modeling_GOT.py
打开上述路径下的 modeling_GOT.py 文件,将第401行的旧方法替换为新方法:
max_cache_length = past_key_values.get_max_length() # ❌ old call: the original line 401 that this guide replaces
max_cache_length = past_key_values.get_seq_length() # ✅ new call: the replacement this guide writes on line 401
# NOTE(review): get_seq_length() is not an obvious like-for-like substitute for get_max_length(); confirm against the Cache API of the installed transformers version.
修改后保存->关闭命令窗口->重新打开命令窗口->进入虚拟环境
7.测试
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer and model from the locally saved copy, then switch the
# model to eval mode and move it to the GPU.
tokenizer = AutoTokenizer.from_pretrained('./GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained(
    './GOT-OCR2_0',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
).eval().cuda()

image_file = '1.jpg'  # replace with the path to your own image
# Run plain OCR on the image and show the recognized text.
res = model.chat(tokenizer, image_file, ocr_type='ocr')
print(res)
上面的代码每次只能输入一张图片(示例中为jpg),不能直接识别pdf,也不能批量处理
8.拓展
1.让模型能识别pdf
2.让模型能批量识别
3.识别内容进行保存,多页pdf分别保存在多个txt文件中
4.多页pdf结果保存在一个txt文件中
pip install pdf2image pillow
sudo apt-get install poppler-utils
from transformers import AutoModel, AutoTokenizer
from pdf2image import convert_from_path
import os
import tempfile
import shutil # 新增模块用于文件合并
# Load the tokenizer and model from the local ./GOT-OCR2_0 directory.
tokenizer = AutoTokenizer.from_pretrained('./GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained(
    './GOT-OCR2_0',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Switch to eval mode and move the model to the GPU.
model = model.eval().cuda()

# Files are read from input_dir; results are written under output_dir.
input_dir, output_dir = 'input', 'output'
os.makedirs(output_dir, exist_ok=True)
def process_image(image_path, output_path):
    """Run OCR on one image and write the recognized text to a file.

    Args:
        image_path: Path of the image passed to ``model.chat``.
        output_path: Path of the UTF-8 text file to write (overwritten).
    """
    recognized_text = model.chat(tokenizer, image_path, ocr_type='ocr')
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(recognized_text)

    # Report only the file names, not the full paths.
    source_name = os.path.basename(image_path)
    target_name = os.path.basename(output_path)
    print(f"Processed {source_name} -> {target_name}")
def process_pdf(pdf_path, base_output_name):
    """Run OCR on each page of a PDF, saving per-page and merged text files.

    A folder named ``base_output_name`` is created under ``output_dir``.
    Each page is converted at 200 dpi, saved as a temporary JPEG, and passed
    to ``process_image``, which writes ``page<N>.txt`` into that folder.
    The page files are then read back and combined into
    ``<base_output_name>.txt`` in the same folder.

    Args:
        pdf_path: Path of the PDF file to convert.
        base_output_name: Name used for the result folder and the merged file.
    """
    # Each PDF gets its own result folder named after the PDF.
    result_dir = os.path.join(output_dir, base_output_name)
    os.makedirs(result_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as scratch_dir:
        pages = convert_from_path(pdf_path, dpi=200, output_folder=scratch_dir)
        sections = []  # one "=== Page N ===" section per page, in page order

        for index, page in enumerate(pages, start=1):
            # process_image takes a file path, so write the page image to disk.
            jpeg_path = os.path.join(scratch_dir, f"page_{index}.jpg")
            page.save(jpeg_path, "JPEG")

            # OCR the page into its own text file.
            text_path = os.path.join(result_dir, f"page{index}.txt")
            process_image(jpeg_path, text_path)

            # Read the page result back so it can go into the merged file.
            with open(text_path, 'r', encoding='utf-8') as page_file:
                page_text = page_file.read()
            sections.append(f"=== Page {index} ===\n{page_text}\n")

        # Write all page sections into a single merged file.
        merged_file = os.path.join(result_dir, f"{base_output_name}.txt")
        with open(merged_file, 'w', encoding='utf-8') as combined:
            combined.write('\n'.join(sections))
        print(f"Merged {len(pages)} pages into {merged_file}")
# Dispatch every entry of input_dir by file extension (case-insensitive):
# images are OCR'd into output_dir/<name>.txt, PDFs go to process_pdf, and
# anything else is skipped.
image_suffixes = ('.png', '.jpg', '.jpeg')
for filename in os.listdir(input_dir):
    lowered = filename.lower()
    is_image = lowered.endswith(image_suffixes)
    if not is_image and not lowered.endswith('.pdf'):
        continue

    file_path = os.path.join(input_dir, filename)
    base_name, _ = os.path.splitext(filename)
    if is_image:
        output_path = os.path.join(output_dir, f"{base_name}.txt")
        process_image(file_path, output_path)
    else:
        process_pdf(file_path, base_name)

print("所有文件处理完成!")
更多推荐
所有评论(0)