Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,22 @@ from PIL import Image
|
|
| 8 |
import fitz # PyMuPDF
|
| 9 |
import numpy as np
|
| 10 |
from transformers import NougatProcessor, VisionEncoderDecoderModel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Set environment variables
|
| 13 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
@@ -90,6 +106,20 @@ def extract_text_from_pdf(pdf_bytes):
|
|
| 90 |
return default_paper_content
|
| 91 |
|
| 92 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
# Load Nougat model
|
| 94 |
processor, model = load_nougat_model()
|
| 95 |
|
|
|
|
| 8 |
import fitz # PyMuPDF
|
| 9 |
import numpy as np
|
| 10 |
from transformers import NougatProcessor, VisionEncoderDecoderModel
|
| 11 |
+
import nltk
|
| 12 |
+
import ssl
|
| 13 |
+
|
| 14 |
+
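# Initialize NLTK: switch the default HTTPS context to an unverified one (certificate checks are disabled process-wide)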
|
| 15 |
+
try:
|
| 16 |
+
_create_unverified_https_context = ssl._create_unverified_context
|
| 17 |
+
except AttributeError:
|
| 18 |
+
pass
|
| 19 |
+
else:
|
| 20 |
+
ssl._create_default_https_context = _create_unverified_https_context
|
| 21 |
+
|
| 22 |
+
# Download the NLTK data this app needs (the 'punkt' tokenizer) if it is not already present
|
| 23 |
+
try:
|
| 24 |
+
nltk.data.find('tokenizers/punkt')
|
| 25 |
+
except LookupError:
|
| 26 |
+
nltk.download('punkt')
|
| 27 |
|
| 28 |
# Set environment variables
|
| 29 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
|
|
| 106 |
return default_paper_content
|
| 107 |
|
| 108 |
try:
|
| 109 |
+
# Make sure NLTK is installed (pip-install it on ImportError) and that the 'punkt' data is available
|
| 110 |
+
try:
|
| 111 |
+
import nltk
|
| 112 |
+
try:
|
| 113 |
+
nltk.data.find('tokenizers/punkt')
|
| 114 |
+
except LookupError:
|
| 115 |
+
nltk.download('punkt')
|
| 116 |
+
except ImportError:
|
| 117 |
+
print("Installing NLTK...")
|
| 118 |
+
import subprocess
|
| 119 |
+
subprocess.check_call(["pip", "install", "nltk", "python-Levenshtein"])
|
| 120 |
+
import nltk
|
| 121 |
+
nltk.download('punkt')
|
| 122 |
+
|
| 123 |
# Load Nougat model
|
| 124 |
processor, model = load_nougat_model()
|
| 125 |
|