Commit d54aa01 · KevinHuSh committed (parent: 7ba250b)

Fit a lot of encodings for text file. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
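In short: the commit adds a `find_codec` helper to `rag/nlp` that probes a long list of candidate codecs and falls back to `utf-8`, then routes every plain-text ingestion path (`book`, `laws`, `naive`, `one`, `qa`, `table`, and the Excel/CSV parser) through it in place of fixed decoding (`naive.py`, for instance, previously fell back to `gb2312`). It also serialises task pickup and progress updates behind database locks, folds title tokens and user-tagged keywords into retrieval scoring, stamps the login time on sign-in, and raises the documented hardware prerequisites. Files changed: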
- README.md +2 -2
- README_ja.md +2 -2
- README_zh.md +2 -2
- api/apps/api_app.py +1 -2
- api/apps/user_app.py +4 -1
- api/db/services/api_service.py +2 -2
- api/db/services/task_service.py +32 -21
- deepdoc/parser/excel_parser.py +4 -1
- rag/app/book.py +4 -2
- rag/app/laws.py +3 -2
- rag/app/naive.py +3 -5
- rag/app/one.py +3 -2
- rag/app/qa.py +3 -2
- rag/app/table.py +3 -2
- rag/nlp/__init__.py +29 -0
- rag/nlp/huqie.py +1 -0
- rag/nlp/search.py +14 -4
- rag/svr/task_broker.py +1 -0
- rag/svr/task_executor.py +5 -3
README.md CHANGED

```diff
@@ -72,8 +72,8 @@
 
 ### 📝 Prerequisites
 
-- CPU >=
-- RAM >=
+- CPU >= 4 cores
+- RAM >= 12 GB
 - Docker >= 24.0.0 & Docker Compose >= v2.26.1
 > If you have not installed Docker on your local machine (Windows, Mac, or Linux), see [Install Docker Engine](https://docs.docker.com/engine/install/).
```
README_ja.md CHANGED

```diff
@@ -72,8 +72,8 @@
 
 ### 📝 必要条件
 
-- CPU >=
-- RAM >=
+- CPU >= 4 cores
+- RAM >= 12 GB
 - Docker >= 24.0.0 & Docker Compose >= v2.26.1
 > ローカルマシン(Windows、Mac、または Linux)に Docker をインストールしていない場合は、[Docker Engine のインストール](https://docs.docker.com/engine/install/) を参照してください。
```
README_zh.md CHANGED

```diff
@@ -72,8 +72,8 @@
 
 ### 📝 前提条件
 
-- CPU >=
-- RAM >=
+- CPU >= 4 核
+- RAM >= 12 GB
 - Docker >= 24.0.0 & Docker Compose >= v2.26.1
 > 如果你并没有在本机安装 Docker(Windows、Mac,或者 Linux), 可以参考文档 [Install Docker Engine](https://docs.docker.com/engine/install/) 自行安装。
```
api/apps/api_app.py CHANGED

```diff
@@ -105,7 +105,7 @@ def stats():
     res = {
         "pv": [(o["dt"], o["pv"]) for o in objs],
         "uv": [(o["dt"], o["uv"]) for o in objs],
-        "speed": [(o["dt"], float(o["tokens"])/float(o["duration"])) for o in objs],
+        "speed": [(o["dt"], float(o["tokens"])/(float(o["duration"]+0.1))) for o in objs],
         "tokens": [(o["dt"], float(o["tokens"])/1000.) for o in objs],
         "round": [(o["dt"], o["round"]) for o in objs],
         "thumb_up": [(o["dt"], o["thumb_up"]) for o in objs]
@@ -176,7 +176,6 @@ def completion():
         conv.reference.append(ans["reference"])
         conv.message.append({"role": "assistant", "content": ans["answer"]})
         API4ConversationService.append_message(conv.id, conv.to_dict())
-        APITokenService.APITokenService(token)
         return get_json_result(data=ans)
     except Exception as e:
         return server_error_response(e)
```
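The `+0.1` in the new `speed` expression guards against zero-duration rows, which previously made the whole stats aggregation raise `ZeroDivisionError`. A minimal standalone sketch (the row values are invented):

```python
# One hypothetical stats row with a zero duration (e.g. an aborted turn).
objs = [{"dt": "2024-04-18", "tokens": 1500, "duration": 0}]

# Old: float(o["tokens"]) / float(o["duration"])        -> ZeroDivisionError
# New: the 0.1-second epsilon keeps the division finite.
speed = [(o["dt"], float(o["tokens"]) / (float(o["duration"] + 0.1))) for o in objs]
print(speed)  # [('2024-04-18', 15000.0)]
```

The flat epsilon does inflate the reported speed for very short durations; `float(o["tokens"]) / max(float(o["duration"]), 0.1)` would cap rather than shift the denominator, if that matters for the dashboard.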
api/apps/user_app.py CHANGED

```diff
@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 import re
+from datetime import datetime
 
 from flask import request, session, redirect
 from werkzeug.security import generate_password_hash, check_password_hash
@@ -22,7 +23,7 @@ from flask_login import login_required, current_user, login_user, logout_user
 from api.db.db_models import TenantLLM
 from api.db.services.llm_service import TenantLLMService, LLMService
 from api.utils.api_utils import server_error_response, validate_request
-from api.utils import get_uuid, get_format_time, decrypt, download_img
+from api.utils import get_uuid, get_format_time, decrypt, download_img, current_timestamp, datetime_format
 from api.db import UserTenantRole, LLMType
 from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
     LLM_FACTORY, LLM_BASE_URL
@@ -56,6 +57,8 @@ def login():
         response_data = user.to_json()
         user.access_token = get_uuid()
         login_user(user)
+        user.update_time = current_timestamp(),
+        user.update_date = datetime_format(datetime.now()),
         user.save()
         msg = "Welcome back!"
         return cors_reponse(data=response_data, auth=user.get_id(), retmsg=msg)
```
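One thing reviewers may want to flag: the two added assignments end with commas, so each attribute is set to a one-element tuple rather than the bare value. A minimal sketch of the pitfall:

```python
from datetime import datetime

stamp = datetime.now(),   # trailing comma -> this is a 1-tuple
print(type(stamp))        # <class 'tuple'>

stamp = datetime.now()    # presumably what was intended
print(type(stamp))        # <class 'datetime.datetime'>
```

Whether the ORM coerces the tuple or fails on `user.save()` depends on the field type, so dropping the commas is the safer form.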
api/db/services/api_service.py CHANGED

```diff
@@ -40,8 +40,8 @@ class API4ConversationService(CommonService):
     @classmethod
     @DB.connection_context()
     def append_message(cls, id, conversation):
-        cls.
-        return cls.model.update(round=cls.model.round + 1).where(id
+        cls.update_by_id(id, conversation)
+        return cls.model.update(round=cls.model.round + 1).where(cls.model.id==id).execute()
 
     @classmethod
     @DB.connection_context()
```

(The two removed lines are cut off in this capture; only their beginnings are visible.)
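The rewritten `append_message` first persists the conversation body, then bumps the `round` counter with a server-side expression. For readers unfamiliar with peewee's update expressions, a self-contained sketch of the same increment pattern (the `Conversation` model below is a hypothetical stand-in, not RAGFlow's):

```python
import peewee

db = peewee.SqliteDatabase(":memory:")

class Conversation(peewee.Model):
    round = peewee.IntegerField(default=0)

    class Meta:
        database = db

db.create_tables([Conversation])
c = Conversation.create()

# round = round + 1 is evaluated inside the UPDATE statement itself, so
# concurrent callers increment atomically instead of racing through a
# Python-side read-modify-write.
Conversation.update(round=Conversation.round + 1).where(Conversation.id == c.id).execute()
print(Conversation.get_by_id(c.id).round)  # 1
```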
api/db/services/task_service.py CHANGED

```diff
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import random
+
 from peewee import Expression
 from api.db.db_models import DB
 from api.db import StatusEnum, FileType, TaskStatus
@@ -26,7 +28,7 @@ class TaskService(CommonService):
 
     @classmethod
     @DB.connection_context()
-    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=
+    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
         fields = [
             cls.model.id,
             cls.model.doc_id,
@@ -45,20 +47,28 @@ class TaskService(CommonService):
             Tenant.img2txt_id,
             Tenant.asr_id,
             cls.model.update_time]
+        with DB.lock("get_task", -1):
+            docs = cls.model.select(*fields) \
+                .join(Document, on=(cls.model.doc_id == Document.id)) \
+                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
+                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
+                .where(
+                    Document.status == StatusEnum.VALID.value,
+                    Document.run == TaskStatus.RUNNING.value,
+                    ~(Document.type == FileType.VIRTUAL.value),
+                    cls.model.progress == 0,
+                    #cls.model.update_time >= tm,
+                    #(Expression(cls.model.create_time, "%%", comm) == mod)
+                )\
+                .order_by(cls.model.update_time.asc())\
+                .paginate(0, items_per_page)
+            docs = list(docs.dicts())
+            if not docs: return []
+            if not takeit: return docs
+
+            cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
+                cls.model.id == docs[0]["id"]).execute()
+            return docs
 
     @classmethod
     @DB.connection_context()
@@ -74,9 +84,10 @@ class TaskService(CommonService):
     @classmethod
     @DB.connection_context()
     def update_progress(cls, id, info):
+        with DB.lock("update_progress", -1):
+            if info["progress_msg"]:
+                cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
+                    cls.model.id == id).execute()
+            if "progress" in info:
+                cls.model.update(progress=info["progress"]).where(
+                    cls.model.id == id).execute()
```

(The removed bodies of `get_tasks` and `update_progress` are not recoverable from this capture; the hunk headers record how many lines they held.)
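The effect of the new `get_tasks` is to turn task pickup into a claim operation: candidates are fetched and the first one is marked as taken (a small random `progress` below 0.1 plus a progress message) inside one `DB.lock` critical section, so two executors polling simultaneously cannot grab the same task; `takeit=False` allows a read-only peek. A hedged sketch of the shape of that pattern, with plain Python objects standing in for the database and `threading.Lock` standing in for `DB.lock`:

```python
import random
import threading

_lock = threading.Lock()
_tasks = [{"id": 1, "progress": 0.0, "progress_msg": ""}]

def get_task(takeit=True):
    with _lock:                          # plays the role of DB.lock("get_task", -1)
        pending = [t for t in _tasks if t["progress"] == 0.0]
        if not pending:
            return None
        if not takeit:                   # peek without claiming
            return pending[0]
        task = pending[0]
        task["progress"] = random.random() / 10.  # nonzero => claimed
        task["progress_msg"] += "\nTask has been received."
        return task
```

Marking the claim through the `progress` column means no extra schema is needed, at the cost of overloading a field that otherwise means "fraction complete".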
deepdoc/parser/excel_parser.py CHANGED

```diff
@@ -3,6 +3,8 @@ from openpyxl import load_workbook
 import sys
 from io import BytesIO
 
+from rag.nlp import find_codec
+
 
 class HuExcelParser:
     def html(self, fnm):
@@ -66,7 +68,8 @@ class HuExcelParser:
             return total
 
         if fnm.split(".")[-1].lower() in ["csv", "txt"]:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding)
             return len(txt.split("\n"))
```
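The six chunkers below (`book`, `laws`, `naive`, `one`, `qa`, `table`) repeat this same two-line `find_codec`-then-`decode` pattern; the helper itself is defined in `rag/nlp/__init__.py` further down, together with a short usage sketch.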
rag/app/book.py CHANGED

```diff
@@ -15,7 +15,8 @@ import re
 from io import BytesIO
 
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions,
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
+    tokenize_chunks, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
 
@@ -87,7 +88,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
```
rag/app/laws.py CHANGED

```diff
@@ -17,7 +17,7 @@ from docx import Document
 
 from api.db import ParserType
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions, tokenize_chunks
+    make_colon_as_title, add_positions, tokenize_chunks, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger
@@ -111,7 +111,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
```
rag/app/naive.py CHANGED

```diff
@@ -14,7 +14,7 @@ from io import BytesIO
 from docx import Document
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks
+from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger
 
@@ -139,10 +139,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        except Exception as e:
-            txt = binary.decode("gb2312")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
```

(The first two removed lines of the old try/except decode block are cut off in this capture.)
rag/app/one.py CHANGED

```diff
@@ -12,7 +12,7 @@
 #
 import re
 from rag.app import laws
-from rag.nlp import huqie, tokenize
+from rag.nlp import huqie, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser
 
 
@@ -82,7 +82,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
```
rag/app/qa.py CHANGED

```diff
@@ -15,7 +15,7 @@ from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from rag.nlp import is_english, random_choices
+from rag.nlp import is_english, random_choices, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import ExcelParser
 
@@ -106,7 +106,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
```
rag/app/table.py CHANGED

```diff
@@ -20,7 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse
 
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.nlp import huqie, is_english, tokenize
+from rag.nlp import huqie, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
 
 
@@ -147,7 +147,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
```
rag/nlp/__init__.py CHANGED

```diff
@@ -6,6 +6,35 @@ from . import huqie
 import re
 import copy
 
+all_codecs = [
+    'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
+    'cp037', 'cp273', 'cp424', 'cp437',
+    'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857',
+    'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869',
+    'cp874', 'cp875', 'cp932', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125',
+    'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+    'cp1257', 'cp1258', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr',
+    'gb2312', 'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
+    'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1',
+    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7',
+    'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13',
+    'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
+    'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
+    'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
+    'utf_32', 'utf_32_be', 'utf_32_le''utf_16_be', 'utf_16_le', 'utf_7'
+]
+
+
+def find_codec(blob):
+    global all_codecs
+    for c in all_codecs:
+        try:
+            blob.decode(c)
+            return c
+        except Exception as e:
+            pass
+    return "utf-8"
+
 
 BULLET_PATTERN = [[
     r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
```
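`find_codec` is the heart of the PR: try each candidate codec in order and return the first one that decodes the blob without error, defaulting to `utf-8`. Two quirks in the committed list are worth noting. First, `'utf_32_le''utf_16_be'` is missing a comma, so Python's adjacent-string concatenation fuses them into the single bogus name `'utf_32_leutf_16_be'`; `decode` raises `LookupError` for it, the bare `except` swallows that, and `utf_16_be` is therefore never actually tried. Second, single-byte codecs such as `latin_1` accept any byte sequence, so every codec listed after `latin_1` is unreachable, and first-match-wins means the result is "a codec that decodes cleanly", not necessarily the one the file was written in (putting strict `utf-8` first makes that heuristic reasonable). A small standalone demonstration, with invented sample data:

```python
from rag.nlp import find_codec

gbk_bytes = "编码测试".encode("gbk")   # these bytes are not valid UTF-8
print(find_codec(gbk_bytes))           # 'gb2312' -- the first codec that decodes them

ascii_bytes = b"plain ascii text"
print(find_codec(ascii_bytes))         # 'utf-8' -- ASCII is valid UTF-8, first match wins
```

Since each failed attempt decodes the whole blob, probing a bounded prefix (say `blob[:1024]`) would keep detection cheap on large files at some accuracy cost; a statistical detector such as `chardet` is the usual alternative.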
rag/nlp/huqie.py CHANGED

```diff
@@ -8,6 +8,7 @@ import re
 import string
 import sys
 from hanziconv import HanziConv
+from huggingface_hub import snapshot_download
 from nltk import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
 from api.utils.file_utils import get_project_base_directory
```
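Only the import is added in this hunk; presumably `snapshot_download` is used elsewhere in `huqie.py` to fetch the tokenizer's dictionary files from the Hugging Face Hub. For reference, its basic call shape (the repo id below is a made-up placeholder, not the one RAGFlow uses):

```python
from huggingface_hub import snapshot_download

# Downloads a full repository snapshot into the local HF cache and
# returns the local directory path. "example-org/huqie-dict" is a
# hypothetical placeholder repo id.
local_dir = snapshot_download(repo_id="example-org/huqie-dict")
print(local_dir)
```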
rag/nlp/search.py CHANGED

```diff
@@ -68,7 +68,7 @@ class Dealer:
         pg = int(req.get("page", 1)) - 1
         ps = int(req.get("size", 1000))
         topk = int(req.get("topk", 1024))
-        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
+        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
                                  "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
 
@@ -289,8 +289,18 @@ class Dealer:
             sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids]
         if not ins_embd:
             return [], [], []
+
+        for i in sres.ids:
+            if isinstance(sres.field[i].get("important_kwd", []), str):
+                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
+        ins_tw = []
+        for i in sres.ids:
+            content_ltks = sres.field[i][cfield].split(" ")
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
+            important_kwd = sres.field[i].get("important_kwd", [])
+            tks = content_ltks + title_tks + important_kwd
+            ins_tw.append(tks)
+
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                         ins_embd,
                                                         keywords,
@@ -368,7 +378,7 @@ class Dealer:
 
     def sql_retrieval(self, sql, fetch_size=128, format="json"):
         from api.settings import chat_logger
-        sql = re.sub(r"[ ]+", " ", sql)
+        sql = re.sub(r"[ `]+", " ", sql)
         sql = sql.replace("%", "")
         es_logger.info(f"Get es sql: {sql}")
         replaces = []
```
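Two retrieval tweaks here: the rerank path now folds title tokens (`title_tks`) and user-tagged keywords (`important_kwd`) into the token lists fed to `hybrid_similarity`, so title and keyword hits lift a chunk's term-similarity score; and `sql_retrieval` now strips backticks along with runs of spaces, since LLM-generated SQL often arrives with MySQL-style backtick quoting that Elasticsearch SQL does not accept. A quick illustration of the widened regex (sample query invented):

```python
import re

sql = "SELECT `docnm_kwd` FROM   chunks"
print(re.sub(r"[ `]+", " ", sql))  # SELECT docnm_kwd FROM chunks
```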
rag/svr/task_broker.py CHANGED

```diff
@@ -121,6 +121,7 @@ def dispatch():
                 tsks.append(new_task())
 
             bulk_insert_into_db(Task, tsks, True)
+            print("TSK:", len(tsks))
             set_dispatching(r["id"])
         except Exception as e:
             cron_logger.exception(e)
```
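The bare `print` reads like temporary debug output left in for visibility; the surrounding code already routes diagnostics through `cron_logger`, so `cron_logger.info("TSK: %d" % len(tsks))` would be the more idiomatic form if the message is meant to stay.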
rag/svr/task_executor.py CHANGED

```diff
@@ -19,6 +19,7 @@ import logging
 import os
 import hashlib
 import copy
+import random
 import re
 import sys
 import time
@@ -92,6 +93,7 @@ def set_progress(task_id, from_page=0, to_page=-1,
 
 def collect(comm, mod, tm):
     tasks = TaskService.get_tasks(tm, mod, comm)
+    #print(tasks)
    if len(tasks) == 0:
         time.sleep(1)
         return pd.DataFrame()
@@ -243,6 +245,7 @@ def main(comm, mod):
     tmf = open(tm_fnm, "a+")
     for _, r in rows.iterrows():
         callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
+        #callback(random.random()/10., "Task has been received.")
         try:
             embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
         except Exception as e:
@@ -300,9 +303,8 @@ if __name__ == "__main__":
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)
 
-    from mpi4py import MPI
-
-    comm = MPI.COMM_WORLD
+    #from mpi4py import MPI
+    #comm = MPI.COMM_WORLD
     while True:
         main(int(sys.argv[2]), int(sys.argv[1]))
         close_connection()
```
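With `mpi4py` commented out, worker identity no longer comes from an MPI communicator: `main` now takes its `comm`/`mod` pair straight from `sys.argv`, and the modulo filter that used them in `get_tasks` is itself commented out in this commit, superseded by the lock-based claiming added there. For context, a hedged sketch of what modulo sharding does (worker counts illustrative):

```python
# Worker `mod` of `comm` total workers handles only the tasks whose
# create_time falls in its residue class -- the scheme this commit
# retires in favour of claiming tasks under DB.lock.
def belongs_to(create_time: int, mod: int, comm: int) -> bool:
    return create_time % comm == mod

tasks = [{"id": i, "create_time": 1713400000 + i} for i in range(6)]
for worker in range(3):  # comm = 3
    mine = [t["id"] for t in tasks if belongs_to(t["create_time"], worker, 3)]
    print(f"worker {worker}: {mine}")
# Each task lands on exactly one worker with no coordination needed --
# but a dead worker's shard simply goes unprocessed, which lock-based
# claiming avoids.
```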