0.介绍

本文介绍如何使用 GOT-OCR2.0 模型提取图片、PDF 等文件中的文字内容,并保存为 txt 文件。

1.创建虚拟环境

conda create -n gotocr python=3.10

2.进入虚拟环境

conda activate gotocr

3.安装需要的包

pip install torch
pip install torchvision
pip install transformers
pip install tiktoken
pip install verovio
pip install accelerate

4.进行模型下载

python
import os
# Point the Hugging Face hub client at the hf-mirror.com mirror. The variable
# is set before transformers is imported so it is already in place when the
# library loads.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
from transformers import AutoModel, AutoTokenizer
# trust_remote_code=True lets the custom model code published with the
# checkpoint run locally.
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)

5.模型保存本地

# Save both the tokenizer and the model into the same local directory so that
# later runs can load them from "./GOT-OCR2_0" instead of the hub.
tokenizer.save_pretrained("./GOT-OCR2_0")
model.save_pretrained("./GOT-OCR2_0")

6.修改代码 

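出现问题:在较新版本的 transformers 下运行第7步的测试代码时,可能会报错 AttributeError: 'DynamicCache' object has no attribute 'get_max_length'(该方法在新版本中已被移除)。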

解决问题

/home/aaa/.cache/huggingface/modules/transformers_modules/ucaslcl/GOT-OCR2_0/.../modeling_GOT.py 路径下文件的第401行修改代码

max_cache_length = past_key_values.get_max_length()  # ❌ 旧方法

max_cache_length = past_key_values.get_seq_length()  # ✅ 新方法

修改后保存->关闭命令窗口->重新打开命令窗口->进入虚拟环境

7.测试

from transformers import AutoModel, AutoTokenizer

# Load the tokenizer and model from the local copy saved in the previous step.
tokenizer = AutoTokenizer.from_pretrained('./GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('./GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
# Switch to evaluation mode and place the model on the GPU.
model = model.eval().cuda()
image_file = '1.jpg'    # replace with the path of your own image
# Run plain-text OCR on the image and print the recognized text.
res = model.chat(tokenizer, image_file, ocr_type='ocr')
print(res)

注意:上面的代码只能输入 jpg 图片。

8.拓展 

1.让模型能识别pdf

2.让模型能批量识别

3.识别内容进行保存,多页pdf分别保存在多个txt文件中

4.多页pdf结果保存在一个txt文件中

pip install pdf2image pillow
sudo apt-get install poppler-utils
from transformers import AutoModel, AutoTokenizer
from pdf2image import convert_from_path
import os
import tempfile
import shutil  # 新增模块用于文件合并

# Load the tokenizer and the OCR model from the local checkpoint directory.
_checkpoint_dir = './GOT-OCR2_0'
tokenizer = AutoTokenizer.from_pretrained(_checkpoint_dir, trust_remote_code=True)
model = AutoModel.from_pretrained(
    _checkpoint_dir,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Switch to evaluation mode and place the model on the GPU.
model = model.eval().cuda()

# Files to recognize are read from input_dir; results are written under
# output_dir, which is created on first use.
input_dir = 'input'
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

def process_image(image_path, output_path):
    """Run OCR on a single image and store the recognized text.

    Args:
        image_path: Path of the image file handed to the model.
        output_path: Path of the UTF-8 text file to (over)write with the
            recognized text.
    """
    recognized_text = model.chat(tokenizer, image_path, ocr_type='ocr')

    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(recognized_text)

    # Log only the file names, not the full paths, to keep the output short.
    source_name = os.path.basename(image_path)
    target_name = os.path.basename(output_path)
    print(f"Processed {source_name} -> {target_name}")

def process_pdf(pdf_path, base_output_name, dpi=200):
    """OCR every page of a PDF, saving per-page text files and a merged file.

    Results are written to ``<output_dir>/<base_output_name>/``: one
    ``page<N>.txt`` per page, plus ``<base_output_name>.txt`` containing all
    pages in order, each preceded by a ``=== Page N ===`` header.

    Args:
        pdf_path: Path of the PDF file to process.
        base_output_name: Name used for the per-PDF output folder and for the
            merged text file (without extension).
        dpi: Resolution used when rendering the PDF pages to images.
            Defaults to 200, the value that was previously hard-coded.

    Note:
        If ``base_output_name`` is itself of the form ``page<N>`` for a page
        number that exists in the PDF, the merged file is written to the same
        path as that page's file and overwrites it.
    """
    # Each PDF gets its own folder named after the PDF.
    pdf_folder = os.path.join(output_dir, base_output_name)
    os.makedirs(pdf_folder, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(pdf_path, dpi=dpi, output_folder=temp_dir)
        merged_content = []  # one "header + text" entry per page, in page order

        for page_num, image in enumerate(images, start=1):
            # Save the page as a JPEG so it can be handed to process_image,
            # which takes a file path.
            temp_image_path = os.path.join(temp_dir, f"page_{page_num}.jpg")
            image.save(temp_image_path, "JPEG")
            # The page image is not needed after saving. Close it so that any
            # file handle it holds inside temp_dir is released before the
            # temporary directory is cleaned up.
            image.close()

            # Per-page text file inside the PDF's own folder.
            page_filename = f"page{page_num}.txt"
            page_path = os.path.join(pdf_folder, page_filename)

            # Recognize the page and write its text file.
            process_image(temp_image_path, page_path)

            # Read the page text back so it can be added to the merged file.
            with open(page_path, 'r', encoding='utf-8') as f:
                merged_content.append(f"=== Page {page_num} ===\n{f.read()}\n")

        # Write all pages into a single file next to the per-page files.
        merged_file = os.path.join(pdf_folder, f"{base_output_name}.txt")
        with open(merged_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(merged_content))
        print(f"Merged {len(images)} pages into {merged_file}")

# Go through the input directory once and dispatch each entry by its file
# extension (matched case-insensitively); any other entry is ignored.
for entry_name in os.listdir(input_dir):
    entry_path = os.path.join(input_dir, entry_name)
    lowered_name = entry_name.lower()
    stem = os.path.splitext(entry_name)[0]

    if lowered_name.endswith('.pdf'):
        # PDFs are handled page by page; the results are named after the file.
        process_pdf(entry_path, stem)
        continue

    if lowered_name.endswith(('.png', '.jpg', '.jpeg')):
        # A single image produces a single text file directly in output_dir.
        process_image(entry_path, os.path.join(output_dir, f"{stem}.txt"))

print("所有文件处理完成!")

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐