Spaces:

SynaptechX
/

ImgTextParser

Running on Zero

App Files Community

nihuajian committed on Aug 21

Commit

4ce6318

verified ·

1 Parent(s): 8b6067a

Create app.py

Browse files

Files changed (1) hide show

app.py +302 -0

app.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import gradio as gr
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+import warnings
+import os
+# Silence every warning raised in this process.
+warnings.filterwarnings("ignore")
+# Module-level cache for the loaded model and tokenizer; both stay None until load_model() sets them.
+model = None
+tokenizer = None
+def load_model():
+    """Load the MiniCPM-o model and tokenizer once and cache them in module globals.
+
+    The checkpoint ``openbmb/MiniCPM-o-2_6`` is loaded in bfloat16 with SDPA
+    attention and ``init_vision=True`` / ``init_audio=False`` / ``init_tts=False``,
+    then switched to eval mode and moved to the CUDA device. Once both objects
+    are cached, later calls return them without loading anything again.
+
+    Returns:
+        tuple: ``(model, tokenizer)``.
+
+    Raises:
+        Exception: anything raised by ``from_pretrained`` or ``.cuda()`` is
+            propagated. The cache is left untouched in that case, so the next
+            call retries the complete load.
+    """
+    global model, tokenizer
+    if model is None or tokenizer is None:
+        print("正在加载MiniCPM-o模型...")
+        loaded_model = AutoModel.from_pretrained(
+            'openbmb/MiniCPM-o-2_6',
+            trust_remote_code=True,
+            attn_implementation='sdpa',
+            torch_dtype=torch.bfloat16,
+            init_vision=True,
+            init_audio=False,
+            init_tts=False
+        )
+        loaded_model = loaded_model.eval().cuda()
+        loaded_tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
+        # Publish both globals together, only after both loads succeeded.
+        # Assigning `model` first would leave a model-without-tokenizer cache
+        # behind if the tokenizer load failed, and every later call would then
+        # return (model, None).
+        model, tokenizer = loaded_model, loaded_tokenizer
+        print("模型加载完成")
+    return model, tokenizer
+def clean_markdown_output(text):
+    """Reduce raw model output to the Markdown table it contains.
+
+    Lines are scanned in order. The table starts at the first line that
+    contains ``|`` (code-fence lines are never table rows) and ends at the
+    first blank line or closing code fence after it. Non-empty lines that
+    directly follow a table row are kept even when they contain no ``|``.
+
+    When no table row is found at all, the text is returned with code-fence
+    markers removed, blank lines dropped and lines starting with a known
+    explanatory phrase filtered out.
+
+    Args:
+        text: Raw text produced by the model.
+
+    Returns:
+        str: The extracted lines joined with newlines; may be empty.
+    """
+    table_lines = []
+    in_table = False
+    for raw_line in text.strip().split('\n'):
+        line = raw_line.strip()
+        is_fence = line.startswith('```')
+        if '|' in line and not is_fence:
+            in_table = True
+            table_lines.append(line)
+        elif in_table:
+            # A blank line or the closing ``` ends the table. Without the
+            # fence check, commentary after the code block would be appended.
+            if line == '' or is_fence:
+                break
+            table_lines.append(line)
+    if table_lines:
+        return '\n'.join(table_lines)
+    # Fallback: no table row found, so strip fences and explanatory lines.
+    explanation_prefixes = ('这个表格', '该表格', '表格显示')
+    unfenced = text.replace('```markdown', '').replace('```', '').strip()
+    stripped_lines = (piece.strip() for piece in unfenced.split('\n'))
+    return '\n'.join(
+        piece for piece in stripped_lines
+        if piece and not piece.startswith(explanation_prefixes)
+    )
+def clean_formula_output(text):
+    """Reduce raw model output to the lines that look like formulas.
+
+    A stripped, non-empty line is kept when it does not start with one of the
+    explanatory prefixes and contains at least one digit or one of the
+    characters ``$ \\ { } ^ _ = + - * / ( ) [ ]``.
+
+    When no line qualifies, the text is returned with code-fence markers
+    removed, blank lines dropped and explanatory lines filtered out. That
+    fallback uses a shorter prefix list, which does not include '识别结果'.
+
+    Args:
+        text: Raw text produced by the model.
+
+    Returns:
+        str: The kept lines joined with newlines; may be empty.
+    """
+    skip_prefixes = ('这个公式', '该公式', '公式表示', '根据图片', '图片中的', '识别结果')
+    # Every character that marks a line as LaTeX or as a plain math expression.
+    math_chars = '$\\{}^_=+-*/()[]'
+    kept = []
+    for raw_line in text.strip().split('\n'):
+        candidate = raw_line.strip()
+        if not candidate or candidate.startswith(skip_prefixes):
+            continue
+        if any(ch.isdigit() or ch in math_chars for ch in candidate):
+            kept.append(candidate)
+    if kept:
+        return '\n'.join(kept)
+    # Fallback: nothing formula-like found, so return the de-fenced text
+    # minus explanatory lines.
+    fallback_prefixes = ('这个公式', '该公式', '公式表示', '根据图片', '图片中的')
+    unfenced = text.replace('```latex', '').replace('```', '').strip()
+    stripped_lines = (piece.strip() for piece in unfenced.split('\n'))
+    return '\n'.join(
+        piece for piece in stripped_lines
+        if piece and not piece.startswith(fallback_prefixes)
+    )
+def clean_text_output(text):
+    """Reduce raw model output to the recognised text itself.
+
+    Code-fence markers (```` ```text ```` and ```` ``` ````) are removed, each
+    line is stripped, and blank lines as well as lines starting with a known
+    explanatory phrase are dropped.
+
+    Args:
+        text: Raw text produced by the model.
+
+    Returns:
+        str: The remaining lines joined with newlines; may be empty.
+    """
+    skip_prefixes = ('图片中的文字', '识别结果', '文字内容', '根据图片', '这张图片', '该图片')
+    unfenced = text.replace('```text', '').replace('```', '').strip()
+    stripped_lines = (piece.strip() for piece in unfenced.split('\n'))
+    return '\n'.join(
+        piece for piece in stripped_lines
+        if piece and not piece.startswith(skip_prefixes)
+    )
+def parse_image(image, parse_type):
+    """Ask the model to transcribe an image and return ``(result, status)``.
+
+    Args:
+        image: A file path, an object with a ``convert`` method (such as a PIL
+            image), or ``None`` when nothing was uploaded.
+        parse_type: One of "表格解析", "公式解析" or "文本解析". Any other
+            value uses the table prompt and returns the stripped raw output.
+
+    Returns:
+        tuple[str, str]: The cleaned model output and a status message. On a
+        missing image the first element is a request to upload one and the
+        status is empty. On any exception the first element carries the
+        error text and the status is "错误".
+    """
+    # Check for a missing upload before loading anything, so an empty
+    # submission never triggers (or fails on) the model load.
+    if image is None:
+        return "请上传一张图片", ""
+    try:
+        model, tokenizer = load_model()
+        # Normalise the input to an RGB image.
+        if isinstance(image, str):
+            # The context manager closes the file handle once the converted
+            # copy exists.
+            with Image.open(image) as opened:
+                image = opened.convert('RGB')
+        elif hasattr(image, 'convert'):
+            image = image.convert('RGB')
+        # One prompt per parse mode.
+        questions = {
+            "表格解析": "解析一下这个表格为markdown格式,不需要任何解释和思考,直接输出markdown格式",
+            "公式解析": "识别并提取图片中的数学公式，用LaTeX格式输出，不需要任何解释，直接输出公式",
+            "文本解析": "识别并提取图片中的所有文字内容，保持原有格式，不需要任何解释，直接输出文字内容"
+        }
+        question = questions.get(parse_type, questions["表格解析"])
+        msgs = [{'role': 'user', 'content': [image, question]}]
+        # The model is called in streaming mode; the chunks are joined into
+        # one string before cleaning.
+        res = model.chat(
+            msgs=msgs,
+            tokenizer=tokenizer,
+            sampling=True,
+            stream=True
+        )
+        generated_text = "".join(res)
+        # Each mode has a cleaner and the format label shown in the status.
+        cleaners = {
+            "表格解析": (clean_markdown_output, "Markdown"),
+            "公式解析": (clean_formula_output, "LaTeX"),
+            "文本解析": (clean_text_output, "纯文本"),
+        }
+        cleaner, output_format = cleaners.get(parse_type, (str.strip, "原始输出"))
+        result = cleaner(generated_text)
+        return result, f"解析完成 - 输出格式: {output_format}"
+    except Exception as e:
+        # UI boundary: report the failure in the result box instead of raising.
+        return f"解析失败: {str(e)}", "错误"
+def create_interface():
+    """Build the Gradio UI and return it without launching it.
+
+    The layout is a header, then a row with two columns (image upload, parse
+    mode and button on the left; status and result boxes on the right), an
+    examples section and a footer. Clicking the button calls ``parse_image``
+    with the image and parse mode, and writes its two return values to the
+    result box and the status box.
+
+    Returns:
+        The ``gr.Blocks`` object that holds the interface.
+    """
+    # Custom CSS: page font, plus a monospace font for the result box.
+    css = """
+    .gradio-container {
+        font-family: 'Helvetica Neue', Arial, sans-serif;
+    }
+    .output-text {
+        font-family: 'Courier New', monospace;
+        font-size: 14px;
+    }
+    """
+    with gr.Blocks(css=css, title="MiniCPM 多模态内容解析工具") as interface:
+        gr.Markdown("""
+        # 🚀 MiniCPM 多模态内容解析工具
+        基于MiniCPM-o多模态模型的智能图片内容解析工具，支持表格、公式、文本三种解析模式。
+        ## 📋 使用说明
+        1. **上传图片**: 支持 PNG、JPG、JPEG 等格式
+        2. **选择解析类型**: 根据图片内容选择相应的解析模式
+        3. **获取结果**: 自动清理输出，获得纯净的解析结果
+        ## 🎯 解析类型说明
+        - **📊 表格解析**: 将表格图片转换为Markdown格式
+        - **🧮 公式解析**: 识别数学公式并输出LaTeX格式
+        - **📝 文本解析**: 提取图片中的所有文字内容
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                # Input components.
+                image_input = gr.Image(
+                    label="📷 上传图片",
+                    type="pil",
+                    height=400
+                )
+                parse_type = gr.Radio(
+                    choices=["表格解析", "公式解析", "文本解析"],
+                    value="表格解析",
+                    label="🎛️ 选择解析类型",
+                    info="根据图片内容选择合适的解析模式"
+                )
+                parse_button = gr.Button(
+                    "🔍 开始解析",
+                    variant="primary",
+                    size="lg"
+                )
+            with gr.Column(scale=1):
+                # Output components.
+                status_output = gr.Textbox(
+                    label="📊 解析状态",
+                    value="等待上传图片...",
+                    interactive=False
+                )
+                result_output = gr.Textbox(
+                    label="📄 解析结果",
+                    lines=20,
+                    max_lines=30,
+                    show_copy_button=True,
+                    elem_classes=["output-text"],
+                    placeholder="解析结果将在这里显示..."
+                )
+        # Example images, given as relative paths.
+        # NOTE(review): presumably these files ship next to app.py - confirm.
+        gr.Markdown("## 📖 示例图片")
+        with gr.Row():
+            gr.Examples(
+                examples=[
+                    ["table.png", "表格解析"],
+                    ["formulas.png", "公式解析"],
+                    ["text.png", "文本解析"]
+                ],
+                inputs=[image_input, parse_type],
+                label="点击示例快速体验"
+            )
+        # Wire the button: parse_image returns (result, status) in this order.
+        parse_button.click(
+            fn=parse_image,
+            inputs=[image_input, parse_type],
+            outputs=[result_output, status_output]
+        )
+        # Footer with usage tips and technical details.
+        gr.Markdown("""
+        ---
+        ### 💡 使用提示
+        - 确保图片清晰，内容结构明显
+        - 复杂表格建议分段处理
+        - 公式图片建议使用高分辨率
+        - 文字图片避免模糊、倾斜或光线不足
+        ### 🔧 技术支持
+        - 模型: MiniCPM-o-2.6
+        - 框架: Gradio + Transformers
+        - GPU: CUDA加速推理
+        """)
+    return interface
+if __name__ == "__main__":
+    # Optional warm-up: load the model at startup so the first request does
+    # not pay for it. A failure is only reported; the server still starts.
+    try:
+        load_model()
+        print("✅ 模型预加载完成")
+    except Exception as exc:
+        print(f"⚠️ 模型预加载失败: {exc}")
+        print("模型将在首次使用时加载")
+    # Build the UI and serve it.
+    launch_options = {
+        "server_name": "0.0.0.0",  # allow external access
+        "server_port": 7860,       # default port on Hugging Face Spaces
+        "share": False,            # keep False when deployed on Hugging Face
+        "show_error": True,        # show detailed error messages
+        "quiet": False,            # print startup information
+    }
+    interface = create_interface()
+    interface.launch(**launch_options)