KevinHuSh committed
Commit d54aa01 · Parent: 7ba250b

Fit a lot of encodings for text file. (#458)


### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
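
The change applied across the parsers below swaps hard-coded `binary.decode("utf-8")` calls for a codec probe, `find_codec`, added in `rag/nlp/__init__.py`. A minimal sketch of the call-site pattern (`decode_text` and the sample payload are illustrative, not part of the PR):

```python
from rag.nlp import find_codec  # helper introduced by this PR

def decode_text(binary: bytes) -> str:
    # Probe candidate codecs and decode with the first that succeeds;
    # find_codec falls back to "utf-8" when nothing matches.
    encoding = find_codec(binary)
    return binary.decode(encoding)

# GBK-encoded Chinese that a bare binary.decode("utf-8") would reject
# with UnicodeDecodeError.
sample = "知识库".encode("gbk")
print(decode_text(sample))  # 知识库
```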

README.md CHANGED
@@ -72,8 +72,8 @@
 
 ### 📝 Prerequisites
 
-- CPU >= 2 cores
-- RAM >= 8 GB
+- CPU >= 4 cores
+- RAM >= 12 GB
 - Docker >= 24.0.0 & Docker Compose >= v2.26.1
 > If you have not installed Docker on your local machine (Windows, Mac, or Linux), see [Install Docker Engine](https://docs.docker.com/engine/install/).
 
README_ja.md CHANGED
@@ -72,8 +72,8 @@
 
 ### 📝 必要条件
 
-- CPU >= 2 cores
-- RAM >= 8 GB
+- CPU >= 4 cores
+- RAM >= 12 GB
 - Docker >= 24.0.0 & Docker Compose >= v2.26.1
 > ローカルマシン(Windows、Mac、または Linux)に Docker をインストールしていない場合は、[Docker Engine のインストール](https://docs.docker.com/engine/install/) を参照してください。
 
README_zh.md CHANGED
@@ -72,8 +72,8 @@
 
 ### 📝 前提条件
 
-- CPU >= 2
-- RAM >= 8 GB
+- CPU >= 4
+- RAM >= 12 GB
 - Docker >= 24.0.0 & Docker Compose >= v2.26.1
 > 如果你并没有在本机安装 Docker(Windows、Mac,或者 Linux), 可以参考文档 [Install Docker Engine](https://docs.docker.com/engine/install/) 自行安装。
 
api/apps/api_app.py CHANGED
@@ -105,7 +105,7 @@ def stats():
         res = {
             "pv": [(o["dt"], o["pv"]) for o in objs],
             "uv": [(o["dt"], o["uv"]) for o in objs],
-            "speed": [(o["dt"], float(o["tokens"])/float(o["duration"])) for o in objs],
+            "speed": [(o["dt"], float(o["tokens"])/(float(o["duration"]+0.1))) for o in objs],
            "tokens": [(o["dt"], float(o["tokens"])/1000.) for o in objs],
            "round": [(o["dt"], o["round"]) for o in objs],
            "thumb_up": [(o["dt"], o["thumb_up"]) for o in objs]
@@ -176,7 +176,6 @@ def completion():
         conv.reference.append(ans["reference"])
         conv.message.append({"role": "assistant", "content": ans["answer"]})
         API4ConversationService.append_message(conv.id, conv.to_dict())
-        APITokenService.APITokenService(token)
         return get_json_result(data=ans)
     except Exception as e:
         return server_error_response(e)
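
The `speed` metric above divided by a raw `duration` that can legitimately be `0`; the `+0.1` keeps the division defined, at the cost of slightly underestimating speed. A tiny illustration with made-up stats rows (field names follow the handler above):

```python
objs = [
    {"dt": "2024-04-18", "tokens": 1500, "duration": 0},   # raised ZeroDivisionError before
    {"dt": "2024-04-19", "tokens": 3000, "duration": 12},
]
speed = [(o["dt"], float(o["tokens"]) / (float(o["duration"] + 0.1))) for o in objs]
print(speed)  # [('2024-04-18', 15000.0), ('2024-04-19', 247.93...)]
```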
api/apps/user_app.py CHANGED
@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 import re
+from datetime import datetime
 
 from flask import request, session, redirect
 from werkzeug.security import generate_password_hash, check_password_hash
@@ -22,7 +23,7 @@ from flask_login import login_required, current_user, login_user, logout_user
 from api.db.db_models import TenantLLM
 from api.db.services.llm_service import TenantLLMService, LLMService
 from api.utils.api_utils import server_error_response, validate_request
-from api.utils import get_uuid, get_format_time, decrypt, download_img
+from api.utils import get_uuid, get_format_time, decrypt, download_img, current_timestamp, datetime_format
 from api.db import UserTenantRole, LLMType
 from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
     LLM_FACTORY, LLM_BASE_URL
@@ -56,6 +57,8 @@ def login():
         response_data = user.to_json()
         user.access_token = get_uuid()
         login_user(user)
+        user.update_time = current_timestamp()
+        user.update_date = datetime_format(datetime.now())
         user.save()
         msg = "Welcome back!"
         return cors_reponse(data=response_data, auth=user.get_id(), retmsg=msg)
api/db/services/api_service.py CHANGED
@@ -40,8 +40,8 @@ class API4ConversationService(CommonService):
     @classmethod
     @DB.connection_context()
     def append_message(cls, id, conversation):
-        cls.model.update_by_id(id, conversation)
-        return cls.model.update(round=cls.model.round + 1).where(id=id).execute()
+        cls.update_by_id(id, conversation)
+        return cls.model.update(round=cls.model.round + 1).where(cls.model.id == id).execute()
 
     @classmethod
     @DB.connection_context()
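
The rewrite matters because peewee's `.where()` accepts column expressions, not keyword arguments (`where(id=id)` raises `TypeError`), and `update_by_id` is defined on the service class rather than on the model. A minimal sketch of the corrected update pattern against a hypothetical model:

```python
from peewee import CharField, IntegerField, Model, SqliteDatabase

db = SqliteDatabase(":memory:")

class Conversation(Model):
    # Hypothetical stand-in for the API4Conversation model.
    id = CharField(primary_key=True)
    round = IntegerField(default=0)

    class Meta:
        database = db

db.create_tables([Conversation])
Conversation.create(id="c1")

# Expression-based filter, as in the fixed code: an atomic server-side increment.
Conversation.update(round=Conversation.round + 1) \
            .where(Conversation.id == "c1").execute()
print(Conversation.get_by_id("c1").round)  # 1
```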
api/db/services/task_service.py CHANGED
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import random
+
 from peewee import Expression
 from api.db.db_models import DB
 from api.db import StatusEnum, FileType, TaskStatus
@@ -26,7 +28,7 @@ class TaskService(CommonService):
 
     @classmethod
     @DB.connection_context()
-    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
+    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
         fields = [
             cls.model.id,
             cls.model.doc_id,
@@ -45,20 +47,28 @@ class TaskService(CommonService):
             Tenant.img2txt_id,
             Tenant.asr_id,
             cls.model.update_time]
-        docs = cls.model.select(*fields) \
-            .join(Document, on=(cls.model.doc_id == Document.id)) \
-            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
-            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
-            .where(
-                Document.status == StatusEnum.VALID.value,
-                Document.run == TaskStatus.RUNNING.value,
-                ~(Document.type == FileType.VIRTUAL.value),
-                cls.model.progress == 0,
-                cls.model.update_time >= tm,
-                (Expression(cls.model.create_time, "%%", comm) == mod))\
-            .order_by(cls.model.update_time.asc())\
-            .paginate(1, items_per_page)
-        return list(docs.dicts())
+        with DB.lock("get_task", -1):
+            docs = cls.model.select(*fields) \
+                .join(Document, on=(cls.model.doc_id == Document.id)) \
+                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
+                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
+                .where(
+                    Document.status == StatusEnum.VALID.value,
+                    Document.run == TaskStatus.RUNNING.value,
+                    ~(Document.type == FileType.VIRTUAL.value),
+                    cls.model.progress == 0,
+                    #cls.model.update_time >= tm,
+                    #(Expression(cls.model.create_time, "%%", comm) == mod)
+                )\
+                .order_by(cls.model.update_time.asc())\
+                .paginate(0, items_per_page)
+            docs = list(docs.dicts())
+            if not docs: return []
+            if not takeit: return docs
+
+            cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
+                cls.model.id == docs[0]["id"]).execute()
+            return docs
 
     @classmethod
     @DB.connection_context()
@@ -74,9 +84,10 @@ class TaskService(CommonService):
     @classmethod
     @DB.connection_context()
     def update_progress(cls, id, info):
-        if info["progress_msg"]:
-            cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
-                cls.model.id == id).execute()
-        if "progress" in info:
-            cls.model.update(progress=info["progress"]).where(
-                cls.model.id == id).execute()
+        with DB.lock("update_progress", -1):
+            if info["progress_msg"]:
+                cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
+                    cls.model.id == id).execute()
+            if "progress" in info:
+                cls.model.update(progress=info["progress"]).where(
+                    cls.model.id == id).execute()
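
The `DB.lock("get_task", -1)` block (a named lock, with `-1` presumably meaning no timeout) turns polling into an atomic claim: a worker selects the oldest unclaimed task and bumps its `progress` above zero inside one critical section, so concurrent executors cannot take the same task; this is also why the time/modulo sharding conditions could be commented out. A minimal sketch of the same claim pattern, with a threading lock standing in for the database lock (all names illustrative):

```python
import random
import threading

_claim_lock = threading.Lock()  # stand-in for DB.lock("get_task", -1)
_tasks = [{"id": "t1", "progress": 0}, {"id": "t2", "progress": 0}]

def claim_one_task():
    # Select-and-mark must be atomic: two workers polling at the same moment
    # would otherwise both see progress == 0 and process the task twice.
    with _claim_lock:
        pending = [t for t in _tasks if t["progress"] == 0]
        if not pending:
            return None
        task = pending[0]
        task["progress"] = random.random() / 10.  # "claimed" marker, as in the PR
        return task

print(claim_one_task()["id"])  # t1
```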
deepdoc/parser/excel_parser.py CHANGED
@@ -3,6 +3,8 @@ from openpyxl import load_workbook
 import sys
 from io import BytesIO
 
+from rag.nlp import find_codec
+
 
 class HuExcelParser:
     def html(self, fnm):
@@ -66,7 +68,8 @@ class HuExcelParser:
                 return total
 
         if fnm.split(".")[-1].lower() in ["csv", "txt"]:
-            txt = binary.decode("utf-8")
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding)
             return len(txt.split("\n"))
 
rag/app/book.py CHANGED
@@ -15,7 +15,8 @@ import re
 from io import BytesIO
 
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, tokenize_chunks
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
+    tokenize_chunks, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
 
@@ -87,7 +88,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
rag/app/laws.py CHANGED
@@ -17,7 +17,7 @@ from docx import Document
 
 from api.db import ParserType
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions, tokenize_chunks
+    make_colon_as_title, add_positions, tokenize_chunks, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger
@@ -111,7 +111,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
rag/app/naive.py CHANGED
@@ -14,7 +14,7 @@ from io import BytesIO
 from docx import Document
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks
+from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger
 
@@ -139,10 +139,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        try:
-            txt = binary.decode("utf-8")
-        except Exception as e:
-            txt = binary.decode("gb2312")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
rag/app/one.py CHANGED
@@ -12,7 +12,7 @@
 #
 import re
 from rag.app import laws
-from rag.nlp import huqie, tokenize
+from rag.nlp import huqie, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser
 
 
@@ -82,7 +82,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
rag/app/qa.py CHANGED
@@ -15,7 +15,7 @@ from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from rag.nlp import is_english, random_choices
+from rag.nlp import is_english, random_choices, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import ExcelParser
 
@@ -106,7 +106,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
rag/app/table.py CHANGED
@@ -20,7 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse
 
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.nlp import huqie, is_english, tokenize
+from rag.nlp import huqie, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
 
 
@@ -147,7 +147,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
rag/nlp/__init__.py CHANGED
@@ -6,6 +6,35 @@ from . import huqie
 import re
 import copy
 
+all_codecs = [
+    'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
+    'cp037', 'cp273', 'cp424', 'cp437',
+    'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857',
+    'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869',
+    'cp874', 'cp875', 'cp932', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125',
+    'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+    'cp1257', 'cp1258', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr',
+    'gb2312', 'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
+    'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1',
+    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7',
+    'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13',
+    'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
+    'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
+    'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
+    'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16_be', 'utf_16_le', 'utf_7'
+]
+
+
+def find_codec(blob):
+    # Return the first codec that decodes the blob without error; utf-8 otherwise.
+    global all_codecs
+    for c in all_codecs:
+        try:
+            blob.decode(c)
+            return c
+        except Exception:
+            pass
+    return "utf-8"
+
 
 BULLET_PATTERN = [[
     r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
rag/nlp/huqie.py CHANGED
@@ -8,6 +8,7 @@ import re
 import string
 import sys
 from hanziconv import HanziConv
+from huggingface_hub import snapshot_download
 from nltk import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
 from api.utils.file_utils import get_project_base_directory
rag/nlp/search.py CHANGED
@@ -68,7 +68,7 @@ class Dealer:
         pg = int(req.get("page", 1)) - 1
         ps = int(req.get("size", 1000))
         topk = int(req.get("topk", 1024))
-        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
+        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
                                  "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
 
@@ -289,8 +289,18 @@
             sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids]
         if not ins_embd:
             return [], [], []
-        ins_tw = [sres.field[i][cfield].split(" ")
-                  for i in sres.ids]
+
+        for i in sres.ids:
+            if isinstance(sres.field[i].get("important_kwd", []), str):
+                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
+        ins_tw = []
+        for i in sres.ids:
+            content_ltks = sres.field[i][cfield].split(" ")
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
+            important_kwd = sres.field[i].get("important_kwd", [])
+            tks = content_ltks + title_tks + important_kwd
+            ins_tw.append(tks)
+
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                         ins_embd,
                                                         keywords,
@@ -368,7 +378,7 @@
 
     def sql_retrieval(self, sql, fetch_size=128, format="json"):
         from api.settings import chat_logger
-        sql = re.sub(r"[ ]+", " ", sql)
+        sql = re.sub(r"[ `]+", " ", sql)
         sql = sql.replace("%", "")
         es_logger.info(f"Get es sql: {sql}")
         replaces = []
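
The retrieval change folds title tokens and manually tagged keywords into each chunk's token list before term-similarity scoring, so a query that matches a document's title or keywords ranks the chunk higher even when the body text never repeats them. A toy illustration of the effect on token overlap (plain set overlap here, not the project's `hybrid_similarity`):

```python
query_tokens = {"install", "docker"}

content_ltks = ["prerequisites", "cpu", "ram"]   # body tokens alone
title_tks = ["install", "docker", "engine"]      # tokens from the title
important_kwd = ["docker"]                       # manually tagged keywords

def overlap(tokens):
    # Toy stand-in for the term-similarity component of the hybrid score.
    return len(query_tokens & set(tokens)) / len(query_tokens)

print(overlap(content_ltks))                               # 0.0 — body misses the query
print(overlap(content_ltks + title_tks + important_kwd))   # 1.0 — title/keywords recover it
```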
rag/svr/task_broker.py CHANGED
@@ -121,6 +121,7 @@ def dispatch():
                 tsks.append(new_task())
 
             bulk_insert_into_db(Task, tsks, True)
+            print("TSK:", len(tsks))
             set_dispatching(r["id"])
         except Exception as e:
             cron_logger.exception(e)
rag/svr/task_executor.py CHANGED
@@ -19,6 +19,7 @@ import logging
 import os
 import hashlib
 import copy
+import random
 import re
 import sys
 import time
@@ -92,6 +93,7 @@ def set_progress(task_id, from_page=0, to_page=-1,
 
 def collect(comm, mod, tm):
     tasks = TaskService.get_tasks(tm, mod, comm)
+    #print(tasks)
     if len(tasks) == 0:
         time.sleep(1)
         return pd.DataFrame()
@@ -243,6 +245,7 @@ def main(comm, mod):
     tmf = open(tm_fnm, "a+")
     for _, r in rows.iterrows():
         callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
+        #callback(random.random()/10., "Task has been received.")
         try:
             embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
         except Exception as e:
@@ -300,9 +303,8 @@ if __name__ == "__main__":
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)
 
-    from mpi4py import MPI
-
-    comm = MPI.COMM_WORLD
+    #from mpi4py import MPI
+    #comm = MPI.COMM_WORLD
     while True:
         main(int(sys.argv[2]), int(sys.argv[1]))
         close_connection()
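
With mpi4py commented out, a worker's identity now comes entirely from its command-line arguments (`main(int(sys.argv[2]), int(sys.argv[1]))`), and the lock in `TaskService.get_tasks` arbitrates contention instead of MPI rank arithmetic. A hypothetical launcher for a small worker pool (argument order inferred from the line above; pool size and paths are illustrative):

```python
import subprocess
import sys

WORKERS = 4  # hypothetical pool size

procs = [
    # argv[1] -> mod (worker index), argv[2] -> comm (pool size), matching
    # main(int(sys.argv[2]), int(sys.argv[1])) in task_executor.py.
    subprocess.Popen([sys.executable, "rag/svr/task_executor.py", str(i), str(WORKERS)])
    for i in range(WORKERS)
]
for p in procs:
    p.wait()
```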