前言

一款电脑智能助手肯定是需要能够拥有看见屏幕的“眼睛”,而最基础的视觉就是使用OCR使得这个助手能识别屏幕上的文字。

接一下上一篇博客的内容,在原有的基础上,我将智谱模型换成了deepseek,最开始选择智谱的原因是因为免费,并且deepseek账号注册第一个月不让充钱(第一次见送钱不要的!!)然后之前不是说ai识别控制指令不稳定嘛,我发现换成deepseek暂时没有遇见这个问题,原来是智慧问题嘛(实在是我想它给我OCR的控制指令非常不稳定,所以就坚决换了。)

问题总结

  • 版本冲突问题,PaddleOCR需要搭配paddlepaddle一起安装,这两个的版本必须要注意,我目前用的版本是paddleocr-3.3.2,paddlepaddle-gpu-2.6.2,paddlepaddle-3.0.0。如果paddlepaddle使用3.3.0版本会由于版本较新,有部分内容paddlerORC还没有兼容导致运行报错。如果paddlepaddle使用2.7.0左右版本会需要搭配1.x的numpy,而我环境中numpy是2.x,如果降低版本会由于太旧与整个环境冲突,出现问题。
paddleocr>=3.3.2
paddlepaddle-gpu>=2.6.2
paddlepaddle>=3.0.0
  • API调用问题,由于我是面向AI编程,然后AI就疯狂乱调用PaddleOCR的API,实在是受不了,还是乖乖去官网上看文档了。看了文档才发现,官方分了文字识别和文字检测,简单来说,文字识别是识别文字是什么但是没有文字的位置,而文字检测是识别有文字的位置而没有文字是什么,那么很正常就想到将两者进行结合使用。(当然,似乎官方也提供了结合的API)

文本检测模块官方调用代码

from paddleocr import TextDetection

model = TextDetection()
output = model.predict("general_ocr_001.png")
for res in output:
    res.print()
    res.save_to_img(save_path="./output/")
    res.save_to_json(save_path="./output/res.json")

文本识别模块官方调用代码

from paddleocr import TextRecognition

model = TextRecognition()
output = model.predict(input="general_ocr_rec_001.png")
for res in output:
    res.print()
    res.save_to_img(save_path="./output/")
    res.save_to_json(save_path="./output/res.json")

代码很简单,这里就不解释了,我暂时的使用方法是先使用文本检测得到屏幕内所有文字的位置,然后对置信度最高的那个文字进行文本识别。(当然,完全可以全部进行文本识别或者先框规定区域再进行文本检测,这些后续再处理)

效果展示

在这里插入图片描述

核心代码

OCRTool类模块

import os
import time
from typing import List, Dict, Optional, Union, Tuple
import numpy as np
from PIL import Image, ImageGrab, ImageFont, ImageDraw
import cv2

try:
    from paddleocr import PaddleOCR, TextDetection

    PADDLEOCR_AVAILABLE = True
except ImportError:
    PADDLEOCR_AVAILABLE = False
    print("警告: PaddleOCR未安装,请运行: pip install paddleocr paddlepaddle")


class OCRTool:
    """OCR工具类,负责所有OCR相关操作"""

    def __init__(self,
                 use_gpu: bool = False,
                 lang: str = 'ch',
                 show_log: bool = False,
                 model_dirs: Optional[Dict] = None,
                 logger=None):
        """
        初始化OCR引擎

        Args:
            use_gpu: 是否使用GPU加速
            lang: 识别语言 ('ch'中文, 'en'英文, 'multi'多语言)
            show_log: 是否显示详细日志
            model_dirs: 自定义模型路径
            logger: 可选的日志记录器
        """
        self.use_gpu = use_gpu
        self.lang = lang
        self.show_log = show_log
        self.model_dirs = model_dirs or {}
        self.logger = logger

        # 初始化OCR引擎
        self.ocr_engine = self._init_ocr_engine()

        # 性能统计
        self.stats = {
            'total_requests': 0,
            'total_characters': 0,
            'avg_process_time': 0,
            'last_operation_time': None,
            'success_count': 0,
            'error_count': 0
        }

        self._log_info(f"OCRTool初始化完成 (语言: {lang}, GPU: {use_gpu})")

    def _init_ocr_engine(self):
        """智能初始化PaddleOCR,测试所有可能的参数组合"""
        if not PADDLEOCR_AVAILABLE:
            raise ImportError("PaddleOCR未安装,请运行: pip install paddleocr paddlepaddle")

        try:
            ocr = TextDetection(model_name="PP-OCRv5_server_det")
            return ocr
        except Exception as e:
            self._log_info("初始化OCR识别")

        # 如果所有组合都失败,尝试导入检查
        self._log_error("所有参数组合都失败,尝试诊断问题...")

        # 检查PaddlePaddle安装
        try:
            import paddle
            self._log_info(f"PaddlePaddle版本: {paddle.__version__}")
            self._log_info(f"是否编译CUDA: {paddle.device.is_compiled_with_cuda()}")
        except Exception as e:
            self._log_error(f"PaddlePaddle检查失败: {e}")

        raise Exception("OCR引擎初始化失败:所有参数组合都无效")

    def recognize_from_file(self,
                            image_path: str,
                            save_result: bool = False,
                            output_dir: str = "./data/OCR_Results",
                            recognize_best_box: bool = True) -> Dict:
        """
        从图片文件识别文字

        Args:
            image_path: 图片路径
            save_result: 是否保存识别结果
            output_dir: 结果保存目录

        Returns:
            识别结果字典
        """
        start_time = time.time()

        try:
            self._log_debug(f"开始识别文件: {image_path}")

            # 检查文件是否存在
            if not os.path.exists(image_path):
                self._log_error(f"图片文件不存在: {image_path}")
                raise FileNotFoundError(f"图片文件不存在: {image_path}")

            # 获取文件大小
            file_size = os.path.getsize(image_path) / 1024  # KB
            self._log_debug(f"文件大小: {file_size:.2f}KB")

            # 1. 文字检测
            from paddleocr import TextDetection
            det_model = TextDetection(model_name="PP-OCRv5_server_det")
            det_results = det_model.predict(image_path, batch_size=1)

            # 2. 格式化检测结果
            detection_result = self._format_ocr_result(det_results)

            if not detection_result['success']:
                self._log_warning("未检测到任何文字区域")
                return detection_result

            self._log_info(f"检测到 {detection_result['total_items']} 个文字区域")

            # 3. 获取最高置信度的检测框
            highest_confidence_box = self.get_highest_confidence_result(detection_result)
            print(f"最高置信度检测框: {highest_confidence_box}")

            # 4. 如果启用识别,对最高置信度区域进行文字识别
            recognition_result = None
            if recognize_best_box and highest_confidence_box:
                recognition_result = self._recognize_text_in_box(
                    image_path,
                    highest_confidence_box['bbox']
                )

            # 合并结果
            combined = self._combine_results(highest_confidence_box, recognition_result)

            # 5. 合并结果
            final_result = {
                'success': True,
                'detection': detection_result,
                'highest_confidence_box': highest_confidence_box,
                'recognition': recognition_result,
                'combined_result': combined,
                'timestamp': time.time(),
                'text': combined.get('text', '') if combined else ''
            }

            # 6. 更新统计信息
            self._update_stats(final_result, start_time, success=True)

            # 7. 保存结果
            if save_result:
                self._save_combined_result(image_path, final_result, output_dir)

            process_time = time.time() - start_time
            self._log_info(f"文件识别完成: {image_path}, 检测到 {detection_result['total_items']} 个区域, "
                           f"识别文字: {final_result['combined_result'].get('text', '无')}, "
                           f"耗时: {process_time:.2f}秒")

            return final_result

        except Exception as e:
            self._update_stats(None, start_time, success=False)
            self._log_error(f"文件识别失败: {image_path}, 错误: {str(e)}")
            raise Exception(f"OCR识别失败: {str(e)}")

    def recognize_from_image(self,
                             image: Union[np.ndarray, Image.Image],
                             save_result: bool = False,
                             output_dir: str = "./data/OCR_Results",
                             recognize_best_box:bool =True) -> Dict:
        """
        从内存中的图像识别文字

        Args:
            image: PIL.Image 或 numpy数组
            save_result: 是否保存结果
            output_dir: 结果保存目录

        Returns:
            识别结果字典
        """
        start_time = time.time()

        try:
            self._log_debug("开始识别内存图像")

            # 获取图像尺寸
            if isinstance(image, Image.Image):
                img_size = f"{image.width}x{image.height}"
                image_array = np.array(image)
            else:
                img_size = f"{image.shape[1]}x{image.shape[0]}"
                image_array = image

            self._log_debug(f"图像尺寸: {img_size}")

            # 1. 文字检测
            from paddleocr import TextDetection
            det_model = TextDetection(model_name="PP-OCRv5_server_det")
            det_results = det_model.predict(image_array, batch_size=1)

            # 2. 格式化检测结果
            detection_result = self._format_ocr_result(det_results)

            if not detection_result['success']:
                self._log_warning("未检测到任何文字区域")
                return detection_result

            self._log_info(f"检测到 {detection_result['total_items']} 个文字区域")

            # 3. 获取最高置信度的检测框
            highest_confidence_box = self.get_highest_confidence_result(detection_result)
            print(f"最高置信度检测框: {highest_confidence_box}")

            # 4. 如果启用识别,对最高置信度区域进行文字识别
            recognition_result = None
            if recognize_best_box and highest_confidence_box:
                recognition_result = self._recognize_text_in_box(
                    image_array,
                    highest_confidence_box['bbox']
                )

            # 合并结果
            combined = self._combine_results(highest_confidence_box, recognition_result)

            # 5. 合并结果
            final_result = {
                'success': True,
                'detection': detection_result,
                'highest_confidence_box': highest_confidence_box,
                'recognition': recognition_result,
                'combined_result': combined,
                'timestamp': time.time(),
                'text': combined.get('text', '') if combined else ''
            }

            # 6. 更新统计信息
            self._update_stats(final_result, start_time, success=True)

            # 7. 保存结果
            if save_result:
                self._save_combined_result(image_array, final_result, output_dir)

            process_time = time.time() - start_time
            self._log_info(f"文件识别完成: {image_array}, 检测到 {detection_result['total_items']} 个区域, "
                           f"识别文字: {final_result['combined_result'].get('text', '无')}, "
                           f"耗时: {process_time:.2f}秒")

            return final_result

        except Exception as e:
            self._update_stats(None, start_time, success=False)
            self._log_error(f"图像识别失败: {str(e)}")
            raise Exception(f"OCR识别失败: {str(e)}")

    def recognize_from_screen(self,
                              region: Optional[Tuple[int, int, int, int]] = None,
                              save_result: bool = False,
                              output_dir: str = "./data/OCR_Results") -> Dict:
        """
        从屏幕截图识别文字

        Args:
            region: 截图区域 (x1, y1, x2, y2),None表示全屏
            save_result: 是否保存结果
            output_dir: 结果保存目录

        Returns:
            识别结果字典
        """
        start_time = time.time()

        try:
            region_info = f"区域: {region}" if region else "全屏"
            self._log_debug(f"开始屏幕识别: {region_info}")

            # 截图
            if region:
                screenshot = ImageGrab.grab(bbox=region)
                self._log_debug(f"截取区域: {region}, 尺寸: {screenshot.width}x{screenshot.height}")
            else:
                screenshot = ImageGrab.grab()
                self._log_debug(f"全屏截图, 尺寸: {screenshot.width}x{screenshot.height}")

            # 识别
            result = self.recognize_from_image(screenshot, save_result, output_dir)

            process_time = time.time() - start_time
            self._log_info(f"屏幕识别完成: {region_info}, 识别到 {result['total_items']} 个项目, "
                           f"耗时: {process_time:.2f}秒")

            return result

        except Exception as e:
            self._log_error(f"屏幕OCR识别失败: {str(e)}")
            raise Exception(f"屏幕OCR识别失败: {str(e)}")

    def recognize_text_in_region(self,
                                 image_path: str,
                                 region: Tuple[int, int, int, int],
                                 save_result: bool = False) -> Dict:
        """
        识别图片中指定区域内的文字

        Args:
            image_path: 图片路径
            region: 区域坐标 (x1, y1, x2, y2)
            save_result: 是否保存结果

        Returns:
            识别结果字典
        """
        start_time = time.time()

        try:
            self._log_debug(f"开始区域识别: {image_path}, 区域: {region}")

            # 读取图片并裁剪区域
            image = Image.open(image_path)
            cropped_image = image.crop(region)

            self._log_debug(f"原始图片尺寸: {image.width}x{image.height}, "
                            f"裁剪区域尺寸: {cropped_image.width}x{cropped_image.height}")

            # 识别裁剪后的图片
            result = self.recognize_from_image(cropped_image, save_result)

            # 调整坐标(从裁剪区域坐标转换到原始图片坐标)
            x1, y1, x2, y2 = region
            for item in result['details']:
                # 调整边界框坐标
                bbox = item['bbox']
                adjusted_bbox = [
                    [bbox[0][0] + x1, bbox[0][1] + y1],
                    [bbox[1][0] + x1, bbox[1][1] + y1],
                    [bbox[2][0] + x1, bbox[2][1] + y1],
                    [bbox[3][0] + x1, bbox[3][1] + y1]
                ]
                item['bbox'] = adjusted_bbox

                # 调整中心点坐标
                item['center'] = (item['center'][0] + x1, item['center'][1] + y1)

            process_time = time.time() - start_time
            self._log_info(f"区域识别完成: {image_path}, 区域: {region}, "
                           f"识别到 {result['total_items']} 个项目, 耗时: {process_time:.2f}秒")

            return result

        except Exception as e:
            self._log_error(f"区域OCR识别失败: {image_path}, 区域: {region}, 错误: {str(e)}")
            raise Exception(f"区域OCR识别失败: {str(e)}")

    def search_text_on_screen(self,
                              text: str,
                              region: Optional[Tuple[int, int, int, int]] = None,
                              case_sensitive: bool = False,
                              confidence_threshold: float = 0.7) -> List[Dict]:
        """
        在屏幕上搜索特定文字

        Args:
            text: 要搜索的文字
            region: 搜索区域
            case_sensitive: 是否区分大小写
            confidence_threshold: 置信度阈值

        Returns:
            匹配结果列表
        """
        start_time = time.time()

        try:
            region_info = f"区域: {region}" if region else "全屏"
            search_info = f"搜索文字: '{text}', 区分大小写: {case_sensitive}, 置信度阈值: {confidence_threshold}"
            self._log_debug(f"开始文字搜索: {search_info}, {region_info}")

            # 识别屏幕文字
            ocr_result = self.recognize_from_screen(region)

            # 搜索匹配的文字
            matches = []
            search_text = text if case_sensitive else text.lower()

            for item in ocr_result['details']:
                item_text = item['text'] if case_sensitive else item['text'].lower()
                confidence = item['confidence']

                # 检查是否包含搜索文本且置信度达标
                if search_text in item_text and confidence >= confidence_threshold:
                    matches.append({
                        'text': item['text'],
                        'confidence': confidence,
                        'bbox': item['bbox'],
                        'center': item['center'],
                        'full_match': (search_text == item_text)
                    })

            process_time = time.time() - start_time
            self._log_info(f"文字搜索完成: 找到 {len(matches)} 个匹配项, "
                           f"耗时: {process_time:.2f}秒, 搜索条件: '{text}'")

            if matches:
                self._log_debug(f"匹配结果前3项: {matches[:3]}")

            return matches

        except Exception as e:
            self._log_error(f"文字搜索失败: {str(e)}")
            raise Exception(f"文字搜索失败: {str(e)}")

    def extract_text_only(self, image_source: Union[str, np.ndarray, Image.Image]) -> str:
        """
        仅提取文字内容(快速模式)

        Args:
            image_source: 图片路径、numpy数组或PIL Image

        Returns:
            提取的文字内容
        """
        start_time = time.time()

        try:
            source_type = type(image_source).__name__
            self._log_debug(f"开始快速提取文字, 源类型: {source_type}")

            if isinstance(image_source, str):
                self._log_debug(f"从文件提取: {image_source}")
                result = self.recognize_from_file(image_source, save_result=False)
            else:
                self._log_debug("从图像提取")
                result = self.recognize_from_image(image_source, save_result=False)

            # 提取所有文字
            texts = [item['text'] for item in result['details']]
            extracted_text = '\n'.join(texts)

            process_time = time.time() - start_time
            self._log_info(f"快速提取完成, 提取到 {len(texts)} 段文字, "
                           f"总字符数: {len(extracted_text)}, 耗时: {process_time:.2f}秒")

            return extracted_text

        except Exception as e:
            self._log_error(f"快速提取失败: {str(e)}")
            return ""

    def _recognize_text_in_box(self, image_path, bbox):
        """
        在指定边界框内进行文字识别

        Args:
            image_path: 原始图片路径
            bbox: 边界框坐标 [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]

        Returns:
            文字识别结果
        """
        try:
            import cv2
            import numpy as np
            from paddleocr import TextRecognition

            # 1. 读取原始图片
            img = cv2.imread(image_path)
            if img is None:
                self._log_error(f"无法读取图片: {image_path}")
                return None

            # 2. 提取边界框坐标
            x_coords = [point[0] for point in bbox]
            y_coords = [point[1] for point in bbox]

            x1, y1 = int(min(x_coords)), int(min(y_coords))
            x2, y2 = int(max(x_coords)), int(max(y_coords))

            # 添加一点边距(避免裁剪太紧)
            margin = 2
            x1, y1 = max(0, x1 - margin), max(0, y1 - margin)
            x2, y2 = min(img.shape[1], x2 + margin), min(img.shape[0], y2 + margin)

            # 3. 裁剪区域
            cropped_img = img[y1:y2, x1:x2]

            if cropped_img.size == 0:
                self._log_warning("裁剪区域为空")
                return None

            # 4. 保存为临时文件(TextRecognition需要文件路径)
            import tempfile
            import os

            temp_dir = tempfile.mkdtemp()
            temp_path = os.path.join(temp_dir, "cropped_text.png")
            cv2.imwrite(temp_path, cropped_img)

            self._log_debug(f"裁剪区域保存到: {temp_path}, 尺寸: {cropped_img.shape[1]}x{cropped_img.shape[0]}")

            # 5. 文字识别
            rec_model = TextRecognition(model_name="PP-OCRv5_server_rec")
            rec_results = rec_model.predict(temp_path, batch_size=1)

            # 6. 格式化识别结果
            recognition_result = self._format_ocr_result(rec_results)

            # 7. 清理临时文件
            try:
                os.remove(temp_path)
                os.rmdir(temp_dir)
            except:
                pass

            self._log_info(f"区域识别完成: 位置({x1},{y1})-({x2},{y2}), "
                           f"识别到文字: {recognition_result.get('text', '无')}")

            return recognition_result

        except Exception as e:
            self._log_error(f"区域文字识别失败: {e}")
            return None

    def _combine_results(self, detection_box, recognition_result):
        """
        合并检测和识别结果

        Args:
            detection_box: 检测框信息
            recognition_result: 文字识别结果

        Returns:
            合并后的结果
        """
        combined = {
            'detection_confidence': detection_box.get('confidence', 0.0),
            'bbox': detection_box.get('bbox', []),
            'center': detection_box.get('center', (0, 0)),
            'bbox_width': detection_box.get('bbox_width', 0),
            'bbox_height': detection_box.get('bbox_height', 0)
        }

        if recognition_result and recognition_result.get('success', False):
            # 获取识别结果中的最高置信度文字
            rec_details = recognition_result.get('details', [])
            if rec_details:
                best_recognition = max(rec_details, key=lambda x: x.get('confidence', 0))
                combined.update({
                    'text': best_recognition.get('text', ''),
                    'recognition_confidence': best_recognition.get('confidence', 0.0),
                    'recognition_details': recognition_result
                })
            else:
                combined['text'] = recognition_result.get('text', '')
                combined['recognition_confidence'] = 0.0
        else:
            combined['text'] = '未识别到文字'
            combined['recognition_confidence'] = 0.0
            combined['recognition_error'] = recognition_result.get('error',
                                                                   '识别失败') if recognition_result else '无识别结果'

        # 计算综合置信度
        combined['combined_confidence'] = (
                combined['detection_confidence'] * 0.4 +
                combined['recognition_confidence'] * 0.6
        )

        return combined

    def _save_combined_result(self, image_path, combined_result, output_dir):
        """
        保存合并的识别结果
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            # 生成文件名
            timestamp = int(time.time())
            basename = os.path.basename(image_path)
            name_without_ext = os.path.splitext(basename)[0]

            # 保存JSON结果
            json_path = os.path.join(output_dir, f"{name_without_ext}_combined_{timestamp}.json")
            import json
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(combined_result, f, ensure_ascii=False, indent=2)

            # 保存文本结果(主要识别内容)
            txt_path = os.path.join(output_dir, f"{name_without_ext}_text_{timestamp}.txt")
            with open(txt_path, 'w', encoding='utf-8') as f:
                combined_text = combined_result.get('combined_result', {}).get('text', '')
                f.write(combined_text)

            # 保存可视化结果
            self._visualize_detection_and_recognition(
                image_path,
                combined_result,
                os.path.join(output_dir, f"{name_without_ext}_visual_{timestamp}.png")
            )

            self._log_info(f"合并结果已保存到: {json_path}, {txt_path}")

        except Exception as e:
            self._log_error(f"保存合并结果失败: {str(e)}")

    def _visualize_detection_and_recognition(self, image_path, result, save_path):
        """
        可视化检测和识别结果
        """
        try:
            import cv2
            import numpy as np

            img = cv2.imread(image_path)
            if img is None:
                return

            # 获取检测框
            detection_result = result.get('detection', {})
            detection_details = detection_result.get('details', [])

            # 绘制所有检测框
            for item in detection_details:
                bbox = item.get('bbox', [])
                if bbox and len(bbox) == 4:
                    pts = np.array(bbox, dtype=np.int32)
                    # 低置信度用黄色,高置信度用绿色
                    confidence = item.get('confidence', 0)
                    color = (0, 255, 0) if confidence > 0.8 else (0, 165, 255)
                    cv2.polylines(img, [pts], isClosed=True, color=color, thickness=1)

            # 高亮显示最高置信度检测框
            highest_box = result.get('highest_confidence_box', {})
            if highest_box and 'bbox' in highest_box:
                bbox = highest_box['bbox']
                pts = np.array(bbox, dtype=np.int32)
                cv2.polylines(img, [pts], isClosed=True, color=(0, 0, 255), thickness=3)

                # 添加识别文字
                text = result.get('combined_result', {}).get('text', '')
                if text:
                    # 在框上方添加文字
                    x_coords = [p[0] for p in bbox]
                    y_coords = [p[1] for p in bbox]
                    x_center = int((min(x_coords) + max(x_coords)) / 2)
                    y_top = min(y_coords) - 10

                    # ====== 使用PIL绘制中文文字 ======
                    # 1. 将OpenCV图像转换为PIL格式
                    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                    img_pil = Image.fromarray(img_rgb)
                    draw = ImageDraw.Draw(img_pil)

                    # 2. 加载中文字体(最简单:使用系统字体)
                    try:
                        # Windows字体路径
                        font_path = "C:/Windows/Fonts/simhei.ttf"  # 黑体
                        if os.path.exists(font_path):
                            font = ImageFont.truetype(font_path, 20)
                        else:
                            # 如果找不到字体,使用默认字体(可能显示方框)
                            font = ImageFont.load_default()
                    except:
                        font = ImageFont.load_default()

                    # 3. 获取文字尺寸
                    try:
                        # Pillow 10.0.0+
                        text_bbox = draw.textbbox((0, 0), text, font=font)
                        text_width = text_bbox[2] - text_bbox[0]
                        text_height = text_bbox[3] - text_bbox[1]
                    except:
                        # Pillow 9.x
                        text_width, text_height = draw.textsize(text, font=font)

                    # 4. 绘制文字背景(用PIL画矩形)
                    draw.rectangle([
                        x_center - text_width // 2 - 5,
                        y_top - text_height - 5,
                        x_center + text_width // 2 + 5,
                        y_top + 5
                    ], fill=(255, 0, 0))  # RGB红色

                    # 5. 绘制中文文字
                    draw.text((x_center - text_width // 2, y_top - text_height),
                              text, font=font, fill=(255, 255, 255))

                    # 6. 转换回OpenCV格式
                    img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

            cv2.imwrite(save_path, img)
            self._log_debug(f"可视化结果已保存: {save_path}")

        except Exception as e:
            self._log_error(f"可视化失败: {e}")

    def get_highest_confidence_result(self, formatted_result):
        """
        从格式化结果中提取置信度最高的内容

        Args:
            formatted_result: 格式化后的OCR结果

        Returns:
            最高置信度的内容字典
        """
        if not formatted_result['success'] or formatted_result['total_items'] == 0:
            return None

        # 从details中找到置信度最高的项
        details = formatted_result['details']
        highest_item = max(details, key=lambda x: x['confidence'])

        return {
            'text': highest_item['text'],
            'confidence': highest_item['confidence'],
            'bbox': highest_item['bbox'],
            'center': highest_item['center'],
            'bbox_width': highest_item['bbox_width'],
            'bbox_height': highest_item['bbox_height']
        }

    def _format_detection_result(self, raw_results) -> Dict:
        """
        格式化文字检测原始结果 - 适配TextDetection.predict()的输出

        Args:
            raw_results: TextDetection.predict()返回的字典列表

        Returns:
            格式化后的结果字典,包含所有检测框的位置和置信度
        """
        details = []

        try:
            # raw_results 是一个列表,第一个元素包含检测信息
            if isinstance(raw_results, list) and len(raw_results) > 0:
                detection_info = raw_results[0]

                self._log_debug(f"检测信息键: {detection_info.keys()}")

                # 提取检测框和置信度
                if 'dt_polys' in detection_info and 'dt_scores' in detection_info:
                    dt_polys = detection_info['dt_polys']  # 形状: (n, 4, 2)
                    dt_scores = detection_info['dt_scores']  # 长度: n

                    self._log_info(f"检测到 {len(dt_polys)} 个文字区域")

                    # 处理每个检测框
                    for i in range(len(dt_polys)):
                        try:
                            # 获取多边形坐标 (4个点,每个点有x,y)
                            polygon = dt_polys[i]

                            # 转换为标准的bbox格式 [左上, 右上, 右下, 左下]
                            bbox = []
                            for point in polygon:
                                bbox.append([int(point[0]), int(point[1])])

                            # 计算边界框的几何属性
                            x_coords = [p[0] for p in bbox]
                            y_coords = [p[1] for p in bbox]

                            bbox_width = max(x_coords) - min(x_coords)
                            bbox_height = max(y_coords) - min(y_coords)
                            center_x = (min(x_coords) + max(x_coords)) / 2
                            center_y = (min(y_coords) + max(y_coords)) / 2

                            confidence = float(dt_scores[i]) if i < len(dt_scores) else 0.0

                            # 创建检测框项
                            item = {
                                'text': f"检测框_{i + 1}",  # 文字检测不提供文字内容
                                'confidence': confidence,
                                'bbox': bbox,  # 真实的边界框坐标
                                'center': (center_x, center_y),
                                'bbox_width': bbox_width,
                                'bbox_height': bbox_height,
                                'box_index': i,
                                'polygon': polygon.tolist() if hasattr(polygon, 'tolist') else polygon,
                                'type': 'text_detection'
                            }

                            details.append(item)

                            self._log_debug(f"检测框{i + 1}: 置信度={confidence:.4f}, "
                                            f"位置=({min(x_coords)},{min(y_coords)})-"
                                            f"({max(x_coords)},{max(y_coords)})")

                        except Exception as e:
                            self._log_error(f"处理检测框{i}时出错: {e}")

                    if details:
                        confidences = [item['confidence'] for item in details]
                        self._log_info(f"检测完成: {len(details)} 个区域, "
                                       f"置信度范围: {min(confidences):.4f}-{max(confidences):.4f}")

        except Exception as e:
            self._log_error(f"格式化检测结果时发生错误: {e}")
            return {
                'success': False,
                'total_items': 0,
                'text': '',
                'details': [],
                'timestamp': time.time(),
                'error': str(e)
            }

        # 创建最终结果字典
        result = {
            'success': len(details) > 0,
            'total_items': len(details),
            'text': f"检测到 {len(details)} 个文字区域",
            'details': details,
            'timestamp': time.time(),
            'result_type': 'text_detection'
        }

        if not result['success']:
            self._log_warning("未检测到任何文字区域")

        return result

    def _format_ocr_result(self, raw_results) -> Dict:
        """
        智能格式化OCR结果 - 根据结果类型自动选择解析方式

        注意:这个函数现在会检测结果是识别结果还是检测结果
        """
        # 先尝试判断结果类型
        if self._is_detection_result(raw_results):
            self._log_info("检测到文字检测结果,使用检测解析器")
            return self._format_detection_result(raw_results)
        else:
            self._log_info("检测到文字识别结果,使用识别解析器")
            return self._format_recognition_result(raw_results)

    def _is_detection_result(self, raw_results):
        """
        判断是否为文字检测结果

        检测结果的判断依据:
        1. 包含'dt_polys'键
        2. 包含'dt_scores'键
        """
        try:
            if isinstance(raw_results, list) and len(raw_results) > 0:
                first_item = raw_results[0]
                if isinstance(first_item, dict):
                    return 'dt_polys' in first_item and 'dt_scores' in first_item
        except:
            pass
        return False

    def _format_recognition_result(self, raw_results) -> Dict:
        """
        格式化文字识别结果 - 这是你原来的识别解析函数
        重命名以区分检测和识别
        """
        # 这是你原来的_format_ocr_result函数的代码,只是改了名字
        details = []
        all_text = []

        # 调试信息:显示原始结果类型
        self._log_debug(f"原始结果类型: {type(raw_results)}")

        try:
            # 情况1:如果raw_results是列表
            if isinstance(raw_results, list):
                for i, result in enumerate(raw_results):
                    try:
                        # 调试:打印每个结果的详细信息
                        self._log_debug(f"结果[{i}]: {result}")

                        # 提取文字和置信度
                        text, confidence = self._extract_from_result_dict(result)

                        if text:
                            # 创建结果项 - 对于纯文本识别,没有bbox信息
                            item = {
                                'text': text,
                                'confidence': confidence,
                                'bbox': [[0, 0], [100, 0], [100, 50], [0, 50]],
                                'center': (50, 25),
                                'bbox_width': 100,
                                'bbox_height': 50,
                                'item_index': i
                            }

                            details.append(item)
                            all_text.append(text)

                            self._log_debug(f"识别到第{i + 1}项: '{text}' (置信度: {confidence:.4f})")

                    except Exception as e:
                        self._log_error(f"处理第{i + 1}个结果时出错: {e}")

            # 情况2:如果raw_results是字典
            elif isinstance(raw_results, dict):
                text, confidence = self._extract_from_result_dict(raw_results)
                if text:
                    item = {
                        'text': text,
                        'confidence': confidence,
                        'bbox': [[0, 0], [100, 0], [100, 50], [0, 50]],
                        'center': (50, 25),
                        'bbox_width': 100,
                        'bbox_height': 50,
                        'item_index': 0
                    }
                    details.append(item)
                    all_text.append(text)

            # 情况3:其他格式
            else:
                self._log_warning(f"无法识别的结果格式: {type(raw_results)}")

                # 尝试转换为字符串处理
                try:
                    result_str = str(raw_results)
                    if result_str and result_str.strip():
                        item = {
                            'text': result_str.strip(),
                            'confidence': 1.0,
                            'bbox': [[0, 0], [100, 0], [100, 50], [0, 50]],
                            'center': (50, 25),
                            'bbox_width': 100,
                            'bbox_height': 50,
                            'item_index': 0
                        }
                        details.append(item)
                        all_text.append(result_str.strip())
                except:
                    pass

        except Exception as e:
            self._log_error(f"格式化结果时发生错误: {e}")
            return {
                'success': False,
                'total_items': 0,
                'text': '',
                'details': [],
                'timestamp': time.time(),
                'error': str(e)
            }

        # 创建最终结果字典
        result = {
            'success': len(details) > 0,
            'total_items': len(details),
            'text': '\n'.join(all_text),
            'details': details,
            'timestamp': time.time(),
            'result_type': 'text_recognition'
        }

        if result['success']:
            # 计算置信度统计
            confidences = [item['confidence'] for item in details]
            if confidences:
                self._log_info(f"识别完成: {len(details)} 个项目, "
                               f"置信度范围: {min(confidences):.4f}-{max(confidences):.4f}, "
                               f"平均: {np.mean(confidences):.4f}")
        else:
            self._log_warning("未识别到任何文字")
            if raw_results is not None:
                self._log_debug(f"原始结果内容: {raw_results}")

        return result

    def _extract_from_result_dict(self, result_dict):
        """
        从字典结果中提取文字和置信度

        Args:
            result_dict: 包含识别结果的字典

        Returns:
            (text, confidence) 元组
        """
        text = ""
        confidence = 0.0

        try:
            # 调试:打印字典结构
            self._log_debug(f"解析字典: {result_dict}")

            # 情况1:字典包含'res'键(TextRecognition的标准输出格式)
            if 'res' in result_dict:
                res_data = result_dict['res']
                self._log_debug(f"res内容: {res_data}")

                if isinstance(res_data, dict):
                    # 提取文字
                    if 'rec_text' in res_data:
                        text = res_data['rec_text']
                    elif 'text' in res_data:
                        text = res_data['text']
                    elif 'ocr_text' in res_data:
                        text = res_data['ocr_text']
                    elif 'result' in res_data:
                        text = res_data['result']

                    # 提取置信度
                    if 'rec_score' in res_data:
                        confidence = float(res_data['rec_score'])
                    elif 'score' in res_data:
                        confidence = float(res_data['score'])
                    elif 'confidence' in res_data:
                        confidence = float(res_data['confidence'])
                    elif 'prob' in res_data:
                        confidence = float(res_data['prob'])

            # 情况2:字典直接包含文本和置信度
            elif 'text' in result_dict:
                text = result_dict['text']
                confidence = result_dict.get('confidence',
                                             result_dict.get('score',
                                                             result_dict.get('rec_score', 0.0)))

            # 情况3:尝试查找所有可能的键
            else:
                # 搜索可能的文本键
                for key in ['rec_text', 'text', 'ocr_text', 'result', 'content', 'words']:
                    if key in result_dict:
                        value = result_dict[key]
                        if isinstance(value, str):
                            text = value
                        elif isinstance(value, list) and len(value) > 0:
                            # 如果是列表,取第一个元素
                            text = str(value[0])
                        else:
                            text = str(value)
                        break

                # 搜索可能的置信度键
                for key in ['rec_score', 'score', 'confidence', 'prob', 'probability']:
                    if key in result_dict:
                        try:
                            confidence = float(result_dict[key])
                            break
                        except (ValueError, TypeError):
                            continue

            # 情况4:如果还没找到,尝试递归搜索嵌套字典
            if not text or text.strip() == "":
                text = self._find_text_recursively(result_dict)

            # 清理文本
            if text:
                text = text.strip()
                # 移除可能的JSON格式字符串中的引号
                if text.startswith('"') and text.endswith('"'):
                    text = text[1:-1]
                elif text.startswith("'") and text.endswith("'"):
                    text = text[1:-1]

            # 如果text是数字,转换为字符串
            if isinstance(text, (int, float)):
                text = str(text)

            # 确保confidence是浮点数且在0-1之间
            if confidence is None:
                confidence = 0.0
            else:
                try:
                    confidence = float(confidence)
                    # 如果置信度大于1,可能是百分比,转换为小数
                    if confidence > 1.0:
                        confidence = confidence / 100.0
                    # 确保在合理范围内
                    confidence = max(0.0, min(1.0, confidence))
                except (ValueError, TypeError):
                    confidence = 0.0

            # 如果text为空但字典有其他信息,返回JSON字符串
            if not text and result_dict:
                import json
                try:
                    text = json.dumps(result_dict, ensure_ascii=False, indent=None)
                    # 如果JSON太长,截断
                    if len(text) > 100:
                        text = text[:100] + "..."
                    confidence = 0.5
                except:
                    text = str(result_dict)

            self._log_debug(f"提取结果: text='{text}', confidence={confidence:.4f}")

        except Exception as e:
            self._log_error(f"提取字典结果时出错: {e}")
            text = str(result_dict)
            confidence = 0.0

        return text, confidence

    def _find_text_recursively(self, data, max_depth=3, current_depth=0):
        """
        递归查找字典或列表中的文本

        Args:
            data: 要搜索的数据
            max_depth: 最大递归深度
            current_depth: 当前递归深度

        Returns:
            找到的文本或空字符串
        """
        if current_depth >= max_depth:
            return ""

        try:
            # 如果是字典
            if isinstance(data, dict):
                # 首先检查常见的文本键
                for key in ['rec_text', 'text', 'result', 'content', 'words', 'ocr_text']:
                    if key in data:
                        value = data[key]
                        if isinstance(value, str) and value.strip():
                            return value.strip()
                        elif isinstance(value, list) and len(value) > 0:
                            # 如果是字符串列表,合并它们
                            if all(isinstance(item, str) for item in value):
                                return ' '.join(item.strip() for item in value if item.strip())
                            else:
                                return str(value[0])
                        elif value is not None:
                            return str(value).strip()

                # 递归检查所有值
                for key, value in data.items():
                    # 跳过某些键以加快搜索
                    if key in ['dt_polys', 'dt_scores', 'input_path', 'page_index', 'input_img']:
                        continue

                    result = self._find_text_recursively(value, max_depth, current_depth + 1)
                    if result:
                        return result

            # 如果是列表
            elif isinstance(data, list):
                for item in data:
                    result = self._find_text_recursively(item, max_depth, current_depth + 1)
                    if result:
                        return result

            # 如果是字符串
            elif isinstance(data, str) and data.strip():
                return data.strip()

            # 如果是其他类型,转换为字符串
            elif data is not None:
                return str(data).strip()

        except Exception as e:
            self._log_debug(f"递归查找文本时出错: {e}")

        return ""

    def _get_confidence_range(self, result: Dict) -> str:
        """获取置信度范围字符串"""
        if not result['details']:
            return "无结果"

        confidences = [item['confidence'] for item in result['details']]
        return f"{min(confidences):.2f}-{max(confidences):.2f}"

    def _save_ocr_result(self, image_path: str, result: Dict, output_dir: str):
        """
        保存OCR结果到文件

        Args:
            image_path: 原始图片路径
            result: OCR结果
            output_dir: 输出目录
        """
        try:
            # 创建输出目录
            os.makedirs(output_dir, exist_ok=True)

            # 生成文件名
            timestamp = int(time.time())
            basename = os.path.basename(image_path)
            name_without_ext = os.path.splitext(basename)[0]

            # 保存JSON结果
            json_path = os.path.join(output_dir, f"{name_without_ext}_ocr_{timestamp}.json")
            import json
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            # 保存文本结果
            txt_path = os.path.join(output_dir, f"{name_without_ext}_text_{timestamp}.txt")
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(result['text'])

            self._log_info(f"OCR结果已保存到: {json_path}, {txt_path}")

        except Exception as e:
            self._log_error(f"保存OCR结果失败: {str(e)}")

    def _update_stats(self, result: Dict, start_time: float, success: bool = True):
        """更新性能统计"""
        process_time = time.time() - start_time

        self.stats['total_requests'] += 1
        self.stats['last_operation_time'] = process_time

        if success:
            self.stats['success_count'] += 1
            if result:
                self.stats['total_characters'] += len(result['text'])
        else:
            self.stats['error_count'] += 1

        # 计算平均处理时间
        successful_requests = self.stats['success_count']
        if successful_requests == 1:
            self.stats['avg_process_time'] = process_time
        elif successful_requests > 1:
            self.stats['avg_process_time'] = (self.stats['avg_process_time'] * (
                    successful_requests - 1) + process_time) / successful_requests

        # 每10次请求记录一次统计信息
        if self.stats['total_requests'] % 10 == 0:
            self._log_stats()

    def _log_stats(self):
        """记录统计信息"""
        stats_info = (
            f"OCR统计: 总请求数={self.stats['total_requests']}, "
            f"成功={self.stats['success_count']}, 失败={self.stats['error_count']}, "
            f"平均耗时={self.stats['avg_process_time']:.2f}秒, "
            f"总字符数={self.stats['total_characters']}"
        )
        self._log_info(stats_info)

    def _log_debug(self, message: str):
        """记录调试日志"""
        if self.logger:
            self.logger.debug(f"[OCRTool] {message}")
        elif self.show_log:
            print(f"[DEBUG] [OCRTool] {message}")

    def _log_info(self, message: str):
        """记录信息日志"""
        if self.logger:
            self.logger.info(f"[OCRTool] {message}")
        elif self.show_log:
            print(f"[INFO] [OCRTool] {message}")

    def _log_warning(self, message: str):
        """记录警告日志"""
        if self.logger:
            self.logger.warning(f"[OCRTool] {message}")
        else:
            print(f"[WARNING] [OCRTool] {message}")

    def _log_error(self, message: str):
        """记录错误日志"""
        if self.logger:
            self.logger.error(f"[OCRTool] {message}")
        else:
            print(f"[ERROR] [OCRTool] {message}")

    def get_statistics(self) -> Dict:
        """获取统计信息"""
        return self.stats.copy()

    def reset_statistics(self):
        """重置统计信息"""
        self.stats = {
            'total_requests': 0,
            'total_characters': 0,
            'avg_process_time': 0,
            'last_operation_time': None,
            'success_count': 0,
            'error_count': 0
        }
        self._log_info("统计信息已重置")

代码有些冗余还没有修改,目前对外主要调用的recognize_from_file函数和recognize_from_screen函数,其他还没有测试。

总结

后续,计划是想先让助手先学会打字,之前的一些问题结合OCR能得到解决,但是我突然又在想为什么不直接复制文字,等我再琢磨琢磨,打字实现起来有点麻烦,但是更加符合人对于电脑的控制。

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐