Rework logging (#3358)
### What problem does this PR solve?
Unified all log files into one.
### Type of change
- [x] Refactoring
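The shape of the change is easiest to read from `api/utils/log_utils.py` below, which shrinks from roughly 300 lines (a `LoggerFactory` that kept separate per-module log files such as `flow`, `database`, and `sql`) to about 25 lines exporting a single `logger` that every module imports. A minimal sketch of what such a unified module could look like, assuming a rotating file handler plus console output; the handler, path, and format choices here are assumptions, not the PR's actual code. Only the exported name `logger` is confirmed by the diffs below.

```python
# Hypothetical sketch of a unified api/utils/log_utils.py (assumptions marked).
import logging
import os
from logging.handlers import RotatingFileHandler

# Assumed location: one logs/ directory at the project root.
LOG_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "logs"))
os.makedirs(LOG_DIR, exist_ok=True)

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

# One shared rotating file instead of per-module logs (flow, database, sql, ...).
file_handler = RotatingFileHandler(
    os.path.join(LOG_DIR, "ragflow_server.log"),
    maxBytes=10 * 1024 * 1024, backupCount=5)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s")
file_handler.setFormatter(formatter)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)
```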
The diff view is limited to 50 files because the PR contains too many changes; see the raw diff for the complete change set. In the reconstructed hunks below, `…` marks lines whose content was truncated in the captured view.
- agent/canvas.py +5 -7
- agent/component/arxiv.py +2 -3
- agent/component/baidu.py +2 -4
- agent/component/base.py +7 -7
- agent/component/bing.py +2 -3
- agent/component/categorize.py +3 -3
- agent/component/duckduckgo.py +2 -2
- agent/component/github.py +2 -2
- agent/component/google.py +3 -3
- agent/component/googlescholar.py +4 -4
- agent/component/keyword.py +2 -2
- agent/component/pubmed.py +2 -2
- agent/component/relevant.py +2 -1
- agent/component/retrieval.py +2 -1
- agent/component/rewrite.py +2 -1
- agent/component/wikipedia.py +2 -4
- agent/component/yahoofinance.py +3 -2
- agent/settings.py +0 -16
- api/apps/__init__.py +5 -10
- api/apps/canvas_app.py +2 -1
- api/apps/llm_app.py +2 -1
- api/apps/sdk/dataset.py +1 -1
- api/apps/user_app.py +7 -7
- api/db/db_models.py +8 -11
- api/db/db_utils.py +0 -6
- api/db/init_data.py +14 -16
- api/db/operatioins.py +0 -21
- api/db/services/dialog_service.py +10 -13
- api/db/services/document_service.py +4 -4
- api/db/services/file_service.py +5 -4
- api/db/services/llm_service.py +17 -17
- api/ragflow_server.py +14 -18
- api/settings.py +0 -17
- api/utils/api_utils.py +4 -3
- api/utils/log_utils.py +25 -287
- deepdoc/parser/pdf_parser.py +25 -24
- deepdoc/parser/resume/entities/corporations.py +9 -3
- deepdoc/parser/resume/step_two.py +20 -15
- deepdoc/vision/operators.py +2 -2
- deepdoc/vision/recognizer.py +2 -1
- deepdoc/vision/seeit.py +2 -1
- deepdoc/vision/t_recognizer.py +5 -2
- graphrag/claim_extractor.py +3 -4
- graphrag/community_reports_extractor.py +4 -7
- graphrag/index.py +3 -2
- graphrag/mind_map_extractor.py +3 -3
- intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py +3 -3
- rag/app/book.py +2 -1
- rag/app/email.py +2 -2
- rag/app/laws.py +3 -3
agent/canvas.py (CHANGED)

```diff
@@ -14,14 +14,12 @@
 # limitations under the License.
 #
 import json
-import traceback
 from abc import ABC
 from copy import deepcopy
 from functools import partial
 from agent.component import component_class
 from agent.component.base import ComponentBase
-from …
-
+from api.utils.log_utils import logger

 class Canvas(ABC):
     """
@@ -189,7 +187,7 @@ class Canvas(ABC):
             if cpn.component_name == "Answer":
                 self.answer.append(c)
             else:
-                …
+                logger.debug(f"Canvas.prepare2run: {c}")
                 cpids = cpn.get_dependent_components()
                 if any([c not in self.path[-1] for c in cpids]):
                     continue
@@ -199,7 +197,7 @@

         prepare2run(self.components[self.path[-2][-1]]["downstream"])
         while 0 <= ran < len(self.path[-1]):
-            …
+            logger.debug(f"Canvas.run: {ran} {self.path}")
             cpn_id = self.path[-1][ran]
             cpn = self.get_component(cpn_id)
             if not cpn["downstream"]: break
@@ -219,7 +217,7 @@
                         self.get_component(p)["obj"].set_exception(e)
                         prepare2run([p])
                         break
-                …
+                logger.exception("Canvas.run got exception")
                 break
             continue

@@ -231,7 +229,7 @@
                         self.get_component(p)["obj"].set_exception(e)
                         prepare2run([p])
                         break
-                …
+                logger.exception("Canvas.run got exception")
                 break

         if self.answer:
```
agent/component/arxiv.py (CHANGED)

```diff
@@ -16,9 +16,8 @@
 from abc import ABC
 import arxiv
 import pandas as pd
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
-
+from api.utils.log_utils import logger

 class ArXivParam(ComponentParamBase):
     """
@@ -65,5 +64,5 @@ class ArXiv(ComponentBase, ABC):
             return ArXiv.be_output("")

         df = pd.DataFrame(arxiv_res)
-        …
+        logger.debug(f"df: {str(df)}")
         return df
```
agent/component/baidu.py (CHANGED)

```diff
@@ -13,14 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import random
 from abc import ABC
-from functools import partial
 import pandas as pd
 import requests
 import re
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
+from api.utils.log_utils import logger


 class BaiduParam(ComponentParamBase):
@@ -64,6 +62,6 @@ class Baidu(ComponentBase, ABC):
             return Baidu.be_output("")

         df = pd.DataFrame(baidu_res)
-        …
+        logger.debug(f"df: {str(df)}")
         return df

```
agent/component/base.py (CHANGED)

```diff
@@ -17,14 +17,14 @@ from abc import ABC
 import builtins
 import json
 import os
-from copy import deepcopy
 from functools import partial
-from typing import …
+from typing import Tuple, Union

 import pandas as pd

 from agent import settings
-from …
+from api.utils.log_utils import logger
+

 _FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
 _DEPRECATED_PARAMS = "_deprecated_params"
@@ -361,13 +361,13 @@ class ComponentParamBase(ABC):

     def _warn_deprecated_param(self, param_name, descr):
         if self._deprecated_params_set.get(param_name):
-            …
+            logger.warning(
                 f"{descr} {param_name} is deprecated and ignored in this version."
             )

     def _warn_to_deprecate_param(self, param_name, descr, new_param):
         if self._deprecated_params_set.get(param_name):
-            …
+            logger.warning(
                 f"{descr} {param_name} will be deprecated in future release; "
                 f"please use {new_param} instead."
             )
@@ -403,7 +403,7 @@ class ComponentBase(ABC):
         return cpnts

     def run(self, history, **kwargs):
-        …
+        logger.info("{}, history: {}, kwargs: {}".format(self, json.dumps(history, ensure_ascii=False),
                                                          json.dumps(kwargs, ensure_ascii=False)))
         try:
             res = self._run(history, **kwargs)
@@ -463,7 +463,7 @@
         reversed_cpnts.extend(self._canvas.path[-2])
         reversed_cpnts.extend(self._canvas.path[-1])

-        …
+        logger.debug(f"{self.component_name} {reversed_cpnts[::-1]}")
         for u in reversed_cpnts[::-1]:
             if self.get_component_name(u) in ["switch", "concentrator"]: continue
             if self.component_name.lower() == "generate" and self.get_component_name(u) == "retrieval":
```
agent/component/bing.py (CHANGED)

```diff
@@ -16,9 +16,8 @@
 from abc import ABC
 import requests
 import pandas as pd
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
-
+from api.utils.log_utils import logger

 class BingParam(ComponentParamBase):
     """
@@ -81,5 +80,5 @@ class Bing(ComponentBase, ABC):
             return Bing.be_output("")

         df = pd.DataFrame(bing_res)
-        …
+        logger.debug(f"df: {str(df)}")
         return df
```
agent/component/categorize.py (CHANGED)

```diff
@@ -17,7 +17,7 @@ from abc import ABC
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
-from …
+from api.utils.log_utils import logger


 class CategorizeParam(GenerateParam):
@@ -34,7 +34,7 @@ class CategorizeParam(GenerateParam):
         super().check()
         self.check_empty(self.category_description, "[Categorize] Category examples")
         for k, v in self.category_description.items():
-            if not k: raise ValueError(…
+            if not k: raise ValueError("[Categorize] Category name can not be empty!")
             if not v.get("to"): raise ValueError(f"[Categorize] 'To' of category {k} can not be empty!")

     def get_prompt(self):
@@ -77,7 +77,7 @@ class Categorize(Generate, ABC):
         chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT, self._param.llm_id)
         ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": input}],
                             self._param.gen_conf())
-        …
+        logger.debug(f"input: {input}, answer: {str(ans)}")
         for c in self._param.category_description.keys():
             if ans.lower().find(c.lower()) >= 0:
                 return Categorize.be_output(self._param.category_description[c]["to"])
```
agent/component/duckduckgo.py (CHANGED)

```diff
@@ -16,8 +16,8 @@
 from abc import ABC
 from duckduckgo_search import DDGS
 import pandas as pd
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
+from api.utils.log_utils import logger


 class DuckDuckGoParam(ComponentParamBase):
@@ -62,5 +62,5 @@ class DuckDuckGo(ComponentBase, ABC):
             return DuckDuckGo.be_output("")

         df = pd.DataFrame(duck_res)
-        …
+        logger.debug("df: {df}")
         return df
```
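One small wrinkle worth noting: as captured, the debug line added here lacks the `f` prefix, so it would log the literal text `df: {df}` rather than the DataFrame. The sibling components (arxiv, bing, github, ...) all use an f-string, which is presumably the intent here too:

```python
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("ragflow")

df = "<a DataFrame>"          # stand-in for the pd.DataFrame built above
logger.debug("df: {df}")      # as captured: logs the literal text "df: {df}"
logger.debug(f"df: {df}")     # the f-string form the sibling components use
```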
agent/component/github.py (CHANGED)

```diff
@@ -16,8 +16,8 @@
 from abc import ABC
 import pandas as pd
 import requests
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
+from api.utils.log_utils import logger


 class GitHubParam(ComponentParamBase):
@@ -57,5 +57,5 @@ class GitHub(ComponentBase, ABC):
             return GitHub.be_output("")

         df = pd.DataFrame(github_res)
-        …
+        logger.debug(f"df: {df}")
         return df
```
agent/component/google.py (CHANGED)

```diff
@@ -16,8 +16,8 @@
 from abc import ABC
 from serpapi import GoogleSearch
 import pandas as pd
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
+from api.utils.log_utils import logger


 class GoogleParam(ComponentParamBase):
@@ -85,12 +85,12 @@ class Google(ComponentBase, ABC):
                                   "hl": self._param.language, "num": self._param.top_n})
             google_res = [{"content": '<a href="' + i["link"] + '">' + i["title"] + '</a> ' + i["snippet"]} for i in
                           client.get_dict()["organic_results"]]
-        except Exception…
+        except Exception:
             return Google.be_output("**ERROR**: Existing Unavailable Parameters!")

         if not google_res:
             return Google.be_output("")

         df = pd.DataFrame(google_res)
-        …
+        logger.debug(f"df: {df}")
         return df
```
agent/component/googlescholar.py (CHANGED)

```diff
@@ -15,9 +15,9 @@
 #
 from abc import ABC
 import pandas as pd
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
 from scholarly import scholarly
+from api.utils.log_utils import logger


 class GoogleScholarParam(ComponentParamBase):
@@ -58,13 +58,13 @@ class GoogleScholar(ComponentBase, ABC):
                                        'pub_url'] + '"></a> ' + "\n author: " + ",".join(pub['bib']['author']) + '\n Abstract: ' + pub[
                                        'bib'].get('abstract', 'no abstract')})

-            except StopIteration or Exception…
-                …
+            except StopIteration or Exception:
+                logger.exception("GoogleScholar")
                 break

         if not scholar_res:
             return GoogleScholar.be_output("")

         df = pd.DataFrame(scholar_res)
-        …
+        logger.debug(f"df: {df}")
         return df
```
agent/component/keyword.py (CHANGED)

```diff
@@ -18,7 +18,7 @@ from abc import ABC
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
-from …
+from api.utils.log_utils import logger


 class KeywordExtractParam(GenerateParam):
@@ -58,5 +58,5 @@ class KeywordExtract(Generate, ABC):
                             self._param.gen_conf())

         ans = re.sub(r".*keyword:", "", ans).strip()
-        …
+        logger.info(f"ans: {ans}")
         return KeywordExtract.be_output(ans)
```
agent/component/pubmed.py (CHANGED)

```diff
@@ -18,8 +18,8 @@ from Bio import Entrez
 import re
 import pandas as pd
 import xml.etree.ElementTree as ET
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
+from api.utils.log_utils import logger


 class PubMedParam(ComponentParamBase):
@@ -65,5 +65,5 @@ class PubMed(ComponentBase, ABC):
             return PubMed.be_output("")

         df = pd.DataFrame(pubmed_res)
-        …
+        logger.debug(f"df: {df}")
         return df
```
agent/component/relevant.py (CHANGED)

```diff
@@ -18,6 +18,7 @@ from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
 from rag.utils import num_tokens_from_string, encoder
+from api.utils.log_utils import logger


 class RelevantParam(GenerateParam):
@@ -70,7 +71,7 @@ class Relevant(Generate, ABC):
         ans = chat_mdl.chat(self._param.get_prompt(), [{"role": "user", "content": ans}],
                             self._param.gen_conf())

-        …
+        logger.info(ans)
         if ans.lower().find("yes") >= 0:
             return Relevant.be_output(self._param.yes)
         if ans.lower().find("no") >= 0:
```
agent/component/retrieval.py (CHANGED)

```diff
@@ -22,6 +22,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMBundle
 from api.settings import retrievaler
 from agent.component.base import ComponentBase, ComponentParamBase
+from api.utils.log_utils import logger


 class RetrievalParam(ComponentParamBase):
@@ -80,7 +81,7 @@ class Retrieval(ComponentBase, ABC):
         df = pd.DataFrame(kbinfos["chunks"])
         df["content"] = df["content_with_weight"]
         del df["content_with_weight"]
-        …
+        logger.debug("{} {}".format(query, df))
         return df

```
agent/component/rewrite.py (CHANGED)

```diff
@@ -17,6 +17,7 @@ from abc import ABC
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from agent.component import GenerateParam, Generate
+from api.utils.log_utils import logger


 class RewriteQuestionParam(GenerateParam):
@@ -104,7 +105,7 @@ class RewriteQuestion(Generate, ABC):
         self._canvas.history.pop()
         self._canvas.history.append(("user", ans))

-        …
+        logger.info(ans)
         return RewriteQuestion.be_output(ans)

```
agent/component/wikipedia.py (CHANGED)

```diff
@@ -13,13 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import random
 from abc import ABC
-from functools import partial
 import wikipedia
 import pandas as pd
-from agent.settings import DEBUG
 from agent.component.base import ComponentBase, ComponentParamBase
+from api.utils.log_utils import logger


 class WikipediaParam(ComponentParamBase):
@@ -65,5 +63,5 @@ class Wikipedia(ComponentBase, ABC):
             return Wikipedia.be_output("")

         df = pd.DataFrame(wiki_res)
-        …
+        logger.debug(f"df: {df}")
         return df
```
agent/component/yahoofinance.py (CHANGED)

```diff
@@ -17,6 +17,7 @@ from abc import ABC
 import pandas as pd
 from agent.component.base import ComponentBase, ComponentParamBase
 import yfinance as yf
+from api.utils.log_utils import logger


 class YahooFinanceParam(ComponentParamBase):
@@ -74,8 +75,8 @@ class YahooFinance(ComponentBase, ABC):
                     {"content": "quarterly cash flow statement:\n" + msft.quarterly_cashflow.to_markdown() + "\n"})
             if self._param.news:
                 yohoo_res.append({"content": "news:\n" + pd.DataFrame(msft.news).to_markdown() + "\n"})
-        except Exception…
-            …
+        except Exception:
+            logger.exception("YahooFinance got exception")

         if not yohoo_res:
             return YahooFinance.be_output("")
```
agent/settings.py (CHANGED)

```diff
@@ -13,22 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Logger
-import os

-from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import LoggerFactory, getLogger
-
-DEBUG = 0
-LoggerFactory.set_directory(
-    os.path.join(
-        get_project_base_directory(),
-        "logs",
-        "flow"))
-# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
-LoggerFactory.LEVEL = 30
-
-flow_logger = getLogger("flow")
-database_logger = getLogger("database")
 FLOAT_ZERO = 1e-8
 PARAM_MAXDEPTH = 5
```
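The block removed above is the pattern this PR retires: `agent/settings.py` built per-package log files through `LoggerFactory` and exposed named loggers (`flow`, `database`) plus a `DEBUG` flag that components imported. The call-site migration across the diff is mechanical; a hedged before/after sketch follows, where the old call-site body is assumed, since the capture truncates the removed lines:

```python
df = "example payload"  # stand-in for the DataFrames the components build

# Before (assumed shape; the removed call-site bodies are truncated in the
# capture): a flag and a per-package logger imported from agent.settings.
#
#     from agent.settings import DEBUG, flow_logger
#     if DEBUG:
#         flow_logger.debug(f"df: {df}")

# After (the pattern visible throughout the diff): one shared logger, with
# the logging level deciding what gets written instead of an ad-hoc flag.
from api.utils.log_utils import logger

logger.debug(f"df: {df}")
```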
api/apps/__init__.py (CHANGED)

```diff
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import logging
 import os
 import sys
 from importlib.util import module_from_spec, spec_from_file_location
@@ -30,18 +29,14 @@ from api.utils import CustomJSONEncoder, commands

 from flask_session import Session
 from flask_login import LoginManager
-from api.settings import SECRET_KEY
-from api.settings import API_VERSION
+from api.settings import SECRET_KEY
+from api.settings import API_VERSION
 from api.utils.api_utils import server_error_response
+from api.utils.log_utils import logger
 from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer

 __all__ = ["app"]

-
-logger = logging.getLogger("flask.app")
-for h in access_logger.handlers:
-    logger.addHandler(h)
-
 Request.json = property(lambda self: self.get_json(force=True, silent=True))

 app = Flask(__name__)
@@ -158,8 +153,8 @@ def load_user(web_request):
             return user[0]
         else:
             return None
-    except Exception…
-        …
+    except Exception:
+        logger.exception("load_user got exception")
         return None
     else:
         return None
```
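A pattern repeated across these handlers: the removed logging lines (lost in this capture) become `logger.exception(...)`. That relies on standard-library behavior, since `logging`'s `exception()` records the traceback of the exception currently being handled automatically, which is also why the PR can drop the `import traceback` lines seen in `agent/canvas.py` and `api/db/services/document_service.py`. A self-contained illustration of that standard behavior:

```python
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ragflow")

try:
    1 / 0
except Exception:
    # logger.exception() == logger.error() plus the traceback of the
    # in-flight exception; no explicit traceback.format_exc() needed.
    logger.exception("load_user got exception")
```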
api/apps/canvas_app.py (CHANGED)

```diff
@@ -23,6 +23,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import get_json_result, server_error_response, validate_request, get_data_error_result
 from agent.canvas import Canvas
 from peewee import MySQLDatabase, PostgresqlDatabase
+from api.utils.log_utils import logger


 @manager.route('/templates', methods=['GET'])
@@ -114,7 +115,7 @@ def run():
             pass
         canvas.add_user_input(req["message"])
         answer = canvas.run(stream=stream)
-        …
+        logger.info(canvas)
     except Exception as e:
         return server_error_response(e)

```
api/apps/llm_app.py (CHANGED)

```diff
@@ -25,6 +25,7 @@ from api.db.db_models import TenantLLM
 from api.utils.api_utils import get_json_result
 from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel, TTSModel
 import requests
+from api.utils.log_utils import logger


 @manager.route('/factories', methods=['GET'])
@@ -89,7 +90,7 @@ def set_api_key():
             if len(arr) == 0 or tc == 0:
                 raise Exception("Fail")
             rerank_passed = True
-            …
+            logger.info(f'passed model rerank {llm.llm_name}')
         except Exception as e:
             msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(
                 e)
```
api/apps/sdk/dataset.py (CHANGED)

```diff
@@ -526,4 +526,4 @@ def list(tenant_id):
             new_key = key_mapping.get(key, key)
             renamed_data[new_key] = value
         renamed_list.append(renamed_data)
-    return get_result(data=renamed_list)
+    return get_result(data=renamed_list)
```

(The old and new lines are textually identical; the change is whitespace-only, most likely adding a trailing newline.)
api/apps/user_app.py (CHANGED)

```diff
@@ -53,8 +53,8 @@ from api.settings import (
 )
 from api.db.services.user_service import UserService, TenantService, UserTenantService
 from api.db.services.file_service import FileService
-from api.settings import stat_logger
 from api.utils.api_utils import get_json_result, construct_response
+from api.utils.log_utils import logger


 @manager.route("/login", methods=["POST", "GET"])
@@ -177,7 +177,7 @@ def github_callback():
         try:
             avatar = download_img(user_info["avatar_url"])
         except Exception as e:
-            …
+            logger.exception(e)
             avatar = ""
         users = user_register(
             user_id,
@@ -202,7 +202,7 @@ def github_callback():
             return redirect("/?auth=%s" % user.get_id())
         except Exception as e:
             rollback_user_registration(user_id)
-            …
+            logger.exception(e)
             return redirect("/?error=%s" % str(e))

     # User has already registered, try to log in
@@ -279,7 +279,7 @@ def feishu_callback():
         try:
             avatar = download_img(user_info["avatar_url"])
         except Exception as e:
-            …
+            logger.exception(e)
             avatar = ""
         users = user_register(
             user_id,
@@ -304,7 +304,7 @@ def feishu_callback():
             return redirect("/?auth=%s" % user.get_id())
         except Exception as e:
             rollback_user_registration(user_id)
-            …
+            logger.exception(e)
             return redirect("/?error=%s" % str(e))

     # User has already registered, try to log in
@@ -436,7 +436,7 @@ def setting_user():
         UserService.update_by_id(current_user.id, update_dict)
         return get_json_result(data=True)
     except Exception as e:
-        …
+        logger.exception(e)
         return get_json_result(
             data=False, message="Update failure!", code=RetCode.EXCEPTION_ERROR
         )
@@ -621,7 +621,7 @@ def user_add():
         )
     except Exception as e:
         rollback_user_registration(user_id)
-        …
+        logger.exception(e)
         return get_json_result(
             data=False,
             message=f"User registration failure, error: {str(e)}",
```
api/db/db_models.py (CHANGED)

```diff
@@ -30,12 +30,9 @@ from peewee import (
 )
 from playhouse.pool import PooledMySQLDatabase, PooledPostgresqlDatabase
 from api.db import SerializedType, ParserType
-from api.settings import DATABASE, …
-from api.utils.log_utils import getLogger
+from api.settings import DATABASE, SECRET_KEY, DATABASE_TYPE
 from api import utils
-
-LOGGER = getLogger()
-
+from api.utils.log_utils import logger

 def singleton(cls, *args, **kw):
     instances = {}
@@ -288,7 +285,7 @@ class BaseDataBase:
         database_config = DATABASE.copy()
         db_name = database_config.pop("name")
         self.database_connection = PooledDatabase[DATABASE_TYPE.upper()].value(db_name, **database_config)
-        …
+        logger.info('init database on cluster mode successfully')

 class PostgresDatabaseLock:
     def __init__(self, lock_name, timeout=10, db=None):
@@ -396,7 +393,7 @@ def close_connection():
         if DB:
             DB.close_stale(age=30)
     except Exception as e:
-        …
+        logger.exception(e)


 class DataBaseModel(BaseModel):
@@ -412,15 +409,15 @@ def init_database_tables(alter_fields=[]):
     for name, obj in members:
         if obj != DataBaseModel and issubclass(obj, DataBaseModel):
             table_objs.append(obj)
-            …
+            logger.info(f"start create table {obj.__name__}")
             try:
                 obj.create_table()
-                …
+                logger.info(f"create table success: {obj.__name__}")
             except Exception as e:
-                …
+                logger.exception(e)
                 create_failed_list.append(obj.__name__)
     if create_failed_list:
-        …
+        logger.info(f"create tables failed: {create_failed_list}")
         raise Exception(f"create tables failed: {create_failed_list}")
     migrate_db()

```
api/db/db_utils.py (CHANGED)

```diff
@@ -22,12 +22,6 @@ from playhouse.pool import PooledMySQLDatabase
 from api.utils import current_timestamp, timestamp_to_date

 from api.db.db_models import DB, DataBaseModel
-from api.db.runtime_config import RuntimeConfig
-from api.utils.log_utils import getLogger
-from enum import Enum
-
-
-LOGGER = getLogger()


 @DB.connection_context()
```
api/db/init_data.py (CHANGED)

```diff
@@ -30,6 +30,7 @@ from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantL…
 from api.db.services.user_service import TenantService, UserTenantService
 from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL
 from api.utils.file_utils import get_project_base_directory
+from api.utils.log_utils import logger


 def encode_to_base64(input_string):
@@ -69,36 +70,34 @@ def init_superuser():
          "api_key": API_KEY, "api_base": LLM_BASE_URL})

     if not UserService.save(**user_info):
-        …
+        logger.info("can't init admin.")
         return
     TenantService.insert(**tenant)
     UserTenantService.insert(**usr_tenant)
     TenantLLMService.insert_many(tenant_llm)
-    …
-    "…
+    logger.info(
+        "Super user initialized. email: admin@ragflow.io, password: admin. Changing the password after logining is strongly recomanded.")

     chat_mdl = LLMBundle(tenant["id"], LLMType.CHAT, tenant["llm_id"])
     msg = chat_mdl.chat(system="", history=[
         {"role": "user", "content": "Hello!"}], gen_conf={})
     if msg.find("ERROR: ") == 0:
-        …
-        "\33[91m【ERROR】\33[0m: ",
+        logger.error(
             "'{}' dosen't work. {}".format(
                 tenant["llm_id"],
                 msg))
     embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"])
     v, c = embd_mdl.encode(["Hello!"])
     if c == 0:
-        …
-        "…
-        " '{}' dosen't work!".format(
+        logger.error(
+            "'{}' dosen't work!".format(
                 tenant["embd_id"]))


 def init_llm_factory():
     try:
         LLMService.filter_delete([(LLM.fid == "MiniMax" or LLM.fid == "Minimax")])
-    except Exception…
+    except Exception:
         pass

     factory_llm_infos = json.load(
@@ -111,14 +110,14 @@ def init_llm_factory():
         llm_infos = factory_llm_info.pop("llm")
         try:
             LLMFactoriesService.save(**factory_llm_info)
-        except Exception…
+        except Exception:
             pass
         LLMService.filter_delete([LLM.fid == factory_llm_info["name"]])
         for llm_info in llm_infos:
             llm_info["fid"] = factory_llm_info["name"]
             try:
                 LLMService.save(**llm_info)
-            except Exception…
+            except Exception:
                 pass

     LLMFactoriesService.filter_delete([LLMFactories.name == "Local"])
@@ -145,7 +144,7 @@
             row = deepcopy(row)
             row["llm_name"] = "text-embedding-3-large"
             TenantLLMService.save(**row)
-        except Exception…
+        except Exception:
             pass
         break
     for kb_id in KnowledgebaseService.get_all_ids():
@@ -169,9 +168,8 @@ def add_graph_templates():
                 CanvasTemplateService.save(**cnvs)
             except:
                 CanvasTemplateService.update_by_id(cnvs["id"], cnvs)
-    except Exception…
-        …
-        print("------------", flush=True)
+    except Exception:
+        logger.exception("Add graph templates error: ")


 def init_web_data():
@@ -182,7 +180,7 @@ def init_web_data():
     # init_superuser()

     add_graph_templates()
-    …
+    logger.info("init web data success:{}".format(time.time() - start_time))


 if __name__ == '__main__':
```
api/db/operatioins.py (DELETED)

```diff
@@ -1,21 +0,0 @@
-#
-#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import operator
-import time
-import typing
-from api.utils.log_utils import sql_logger
-import peewee
```
api/db/services/dialog_service.py (CHANGED)

```diff
@@ -26,11 +26,12 @@ from api.db.db_models import Dialog, Conversation,DB
 from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
-from api.settings import …
+from api.settings import retrievaler, kg_retrievaler
 from rag.app.resume import forbidden_select_fields4resume
 from rag.nlp.search import index_name
 from rag.utils import rmSpace, num_tokens_from_string, encoder
 from api.utils.file_utils import get_project_base_directory
+from api.utils.log_utils import logger


 class DialogService(CommonService):
@@ -177,7 +178,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         tts_mdl = LLMBundle(dialog.tenant_id, LLMType.TTS)
     # try to use sql if field mapping is good to go
     if field_map:
-        …
+        logger.info("Use SQL to retrieval:{}".format(questions[-1]))
         ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl, prompt_config.get("quote", True))
         if ans:
             yield ans
@@ -219,7 +220,7 @@
                                         doc_ids=attachments,
                                         top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
         knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
-        …
+        logger.info(
             "{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
     retrieval_tm = timer()

@@ -291,7 +292,7 @@
         yield decorate_answer(answer)
     else:
         answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
-        …
+        logger.info("User: {}|Assistant: {}".format(
             msg[-1]["content"], answer))
         res = decorate_answer(answer)
         res["audio_binary"] = tts(tts_mdl, answer)
@@ -319,8 +320,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
         nonlocal sys_prompt, user_promt, question, tried_times
         sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {
             "temperature": 0.06})
-        …
-        chat_logger.info(f"“{question}”==>{user_promt} get SQL: {sql}")
+        logger.info(f"{question} ==> {user_promt} get SQL: {sql}")
         sql = re.sub(r"[\r\n]+", " ", sql.lower())
         sql = re.sub(r".*select ", "select ", sql.lower())
         sql = re.sub(r" +", " ", sql)
@@ -340,9 +340,7 @@
             flds.append(k)
         sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:]

-        …
-        …
-        chat_logger.info(f"“{question}” get SQL(refined): {sql}")
+        logger.info(f"{question} get SQL(refined): {sql}")
         tried_times += 1
         return retrievaler.sql_retrieval(sql, format="json"), sql

@@ -371,10 +369,9 @@
             question, sql, tbl["error"]
         )
         tbl, sql = get_table()
-        …
+        logger.info("TRY it again: {}".format(sql))

-    …
-    print(tbl)
+    logger.info("GET table: {}".format(tbl))
     if tbl.get("error") or len(tbl["rows"]) == 0:
         return None

@@ -404,7 +401,7 @@
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)

     if not docid_idx or not docnm_idx:
-        …
+        logger.warning("SQL missing field: " + sql)
         return {
             "answer": "\n".join([clmns, line, rows]),
             "reference": {"chunks": [], "doc_aggs": []},
```
api/db/services/document_service.py (CHANGED)

```diff
@@ -17,7 +17,6 @@ import hashlib
 import json
 import random
 import re
-import traceback
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from datetime import datetime
@@ -26,7 +25,7 @@ from io import BytesIO
 from peewee import fn

 from api.db.db_utils import bulk_insert_into_db
-from api.settings import …
+from api.settings import docStoreConn
 from api.utils import current_timestamp, get_format_time, get_uuid
 from graphrag.mind_map_extractor import MindMapExtractor
 from rag.settings import SVR_QUEUE_NAME
@@ -40,6 +39,7 @@ from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db import StatusEnum
 from rag.utils.redis_conn import REDIS_CONN
+from api.utils.log_utils import logger


 class DocumentService(CommonService):
@@ -387,7 +387,7 @@
                 cls.update_by_id(d["id"], info)
             except Exception as e:
                 if str(e).find("'0'") < 0:
-                    …
+                    logger.exception("fetch task exception")

     @classmethod
     @DB.connection_context()
@@ -544,7 +544,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
                 "knowledge_graph_kwd": "mind_map"
             })
         except Exception as e:
-            …
+            logger.exception("Mind map generation error")

         vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
         assert len(cks) == len(vects)
```
api/db/services/file_service.py (CHANGED)

```diff
@@ -28,6 +28,7 @@ from api.db.services.file2document_service import File2DocumentService
 from api.utils import get_uuid
 from api.utils.file_utils import filename_type, thumbnail_img
 from rag.utils.storage_factory import STORAGE_IMPL
+from api.utils.log_utils import logger


 class FileService(CommonService):
@@ -272,8 +273,8 @@
                 cls.delete_folder_by_pf_id(user_id, file.id)
             return cls.model.delete().where((cls.model.tenant_id == user_id)
                                             & (cls.model.id == folder_id)).execute(),
-        except Exception…
-            …
+        except Exception:
+            logger.exception("delete_folder_by_pf_id")
             raise RuntimeError("Database error (File retrieval)!")

     @classmethod
@@ -321,8 +322,8 @@
     def move_file(cls, file_ids, folder_id):
         try:
             cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
-        except Exception…
-            …
+        except Exception:
+            logger.exception("move_file")
             raise RuntimeError("Database error (File move)!")

     @classmethod
```
api/db/services/llm_service.py
CHANGED
@@ -14,12 +14,12 @@
 # limitations under the License.
 #
 from api.db.services.user_service import TenantService
-from api.settings import database_logger
 from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel, TTSModel
 from api.db import LLMType
 from api.db.db_models import DB
 from api.db.db_models import LLMFactories, LLM, TenantLLM
 from api.db.services.common_service import CommonService
+from api.utils.log_utils import logger


 class LLMFactoriesService(CommonService):
@@ -209,40 +209,40 @@ class LLMBundle(object):
        emd, used_tokens = self.mdl.encode(texts, batch_size)
        if not TenantLLMService.increase_usage(
                self.tenant_id, self.llm_type, used_tokens):
-
-
+            logger.error(
+                "LLMBundle.encode can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
        return emd, used_tokens

    def encode_queries(self, query: str):
        emd, used_tokens = self.mdl.encode_queries(query)
        if not TenantLLMService.increase_usage(
                self.tenant_id, self.llm_type, used_tokens):
-
-
+            logger.error(
+                "LLMBundle.encode_queries can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens))
        return emd, used_tokens

    def similarity(self, query: str, texts: list):
        sim, used_tokens = self.mdl.similarity(query, texts)
        if not TenantLLMService.increase_usage(
                self.tenant_id, self.llm_type, used_tokens):
-
-
+            logger.error(
+                "LLMBundle.similarity can't update token usage for {}/RERANK used_tokens: {}".format(self.tenant_id, used_tokens))
        return sim, used_tokens

    def describe(self, image, max_tokens=300):
        txt, used_tokens = self.mdl.describe(image, max_tokens)
        if not TenantLLMService.increase_usage(
                self.tenant_id, self.llm_type, used_tokens):
-
-
+            logger.error(
+                "LLMBundle.describe can't update token usage for {}/IMAGE2TEXT used_tokens: {}".format(self.tenant_id, used_tokens))
        return txt

    def transcription(self, audio):
        txt, used_tokens = self.mdl.transcription(audio)
        if not TenantLLMService.increase_usage(
                self.tenant_id, self.llm_type, used_tokens):
-
-
+            logger.error(
+                "LLMBundle.transcription can't update token usage for {}/SEQUENCE2TXT used_tokens: {}".format(self.tenant_id, used_tokens))
        return txt

    def tts(self, text):
@@ -250,8 +250,8 @@ class LLMBundle(object):
            if isinstance(chunk,int):
                if not TenantLLMService.increase_usage(
                        self.tenant_id, self.llm_type, chunk, self.llm_name):
-
-
+                    logger.error(
+                        "LLMBundle.tts can't update token usage for {}/TTS".format(self.tenant_id))
                    return
            yield chunk

@@ -259,8 +259,8 @@ class LLMBundle(object):
        txt, used_tokens = self.mdl.chat(system, history, gen_conf)
        if isinstance(txt, int) and not TenantLLMService.increase_usage(
                self.tenant_id, self.llm_type, used_tokens, self.llm_name):
-
-
+            logger.error(
+                "LLMBundle.chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.llm_name, used_tokens))
        return txt

    def chat_streamly(self, system, history, gen_conf):
@@ -268,7 +268,7 @@ class LLMBundle(object):
            if isinstance(txt, int):
                if not TenantLLMService.increase_usage(
                        self.tenant_id, self.llm_type, txt, self.llm_name):
-
+                    logger.error(
+                        "LLMBundle.chat_streamly can't update token usage for {}/CHAT llm_name: {}, content: {}".format(self.tenant_id, self.llm_name, txt))
                    return
            yield txt
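
Every LLMBundle method in the hunk above repeats the same guard: call the model, try to record token usage, and emit a logger.error naming the method and tenant when the update fails. A minimal sketch of that shared pattern, factored into a hypothetical helper (the helper name is illustrative and not part of the PR):

def _warn_usage_not_counted(logger, method: str, tenant_id: str, llm_type: str, used_tokens: int) -> None:
    # Mirrors the message shape used throughout llm_service.py:
    # "LLMBundle.<method> can't update token usage for <tenant>/<TYPE> used_tokens: <n>"
    logger.error(
        "LLMBundle.{} can't update token usage for {}/{} used_tokens: {}".format(
            method, tenant_id, llm_type, used_tokens))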
api/ragflow_server.py
CHANGED
@@ -27,13 +27,10 @@ from api.apps import app
 from api.db.runtime_config import RuntimeConfig
 from api.db.services.document_service import DocumentService
 from api.settings import (
-    HOST,
-    HTTP_PORT,
-    access_logger,
-    database_logger,
-    stat_logger,
+    HOST, HTTP_PORT
 )
 from api import utils
+from api.utils.log_utils import logger

 from api.db.db_models import init_database_tables as init_web_db
 from api.db.init_data import init_web_data
@@ -45,23 +42,22 @@ def update_progress():
         time.sleep(3)
         try:
             DocumentService.update_progress()
-        except Exception
-
+        except Exception:
+            logger.exception("update_progress exception")


-if __name__ ==
-
-    r"""
+if __name__ == '__main__':
+    logger.info(r"""
     ____   ___   ______ ______ __
    / __ \ /   | / ____// ____// /____  _      __
   / /_/ // /| |/ / __ / /_   / // __ \| | /| / /
  / _, _// ___ |/ /_/ // __/  / // /_/ /| |/ |/ /
 /_/ |_|/_/  |_|\____//_/    /_/ \____/ |__/|__/

-    """
-
+    """)
+    logger.info(
+        f'project base: {utils.file_utils.get_project_base_directory()}'
     )
-    stat_logger.info(f"project base: {utils.file_utils.get_project_base_directory()}")

     # init db
     init_web_db()
@@ -83,7 +79,7 @@ if __name__ == "__main__":

     RuntimeConfig.DEBUG = args.debug
     if RuntimeConfig.DEBUG:
-
+        logger.info("run on debug mode")

     RuntimeConfig.init_env()
     RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT)
@@ -91,17 +87,17 @@ if __name__ == "__main__":
     peewee_logger = logging.getLogger("peewee")
     peewee_logger.propagate = False
     # rag_arch.common.log.ROpenHandler
-    peewee_logger.addHandler(
-    peewee_logger.setLevel(
+    peewee_logger.addHandler(logger.handlers[0])
+    peewee_logger.setLevel(logger.handlers[0].level)

     thr = ThreadPoolExecutor(max_workers=1)
     thr.submit(update_progress)

     # start http server
     try:
-
+        logger.info("RAG Flow http server start...")
         werkzeug_logger = logging.getLogger("werkzeug")
-        for h in
+        for h in logger.handlers:
             werkzeug_logger.addHandler(h)
         run_simple(
             hostname=HOST,
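
The server wiring above is the crux of the unification: instead of giving peewee and werkzeug their own log files, their loggers are pointed at the handlers of the single "ragflow" logger. A standalone sketch of the same idea (the helper name is illustrative; note the PR itself attaches only the first handler to peewee but all handlers to werkzeug):

import logging

def route_through(unified: logging.Logger, *names: str) -> None:
    # Stop each third-party logger from bubbling up to the root logger,
    # then attach the unified logger's handlers so everything lands in one place.
    for name in names:
        third_party = logging.getLogger(name)
        third_party.propagate = False
        for h in unified.handlers:
            third_party.addHandler(h)

# e.g. route_through(logger, "peewee", "werkzeug")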
api/settings.py
CHANGED
@@ -17,24 +17,9 @@ import os
 from datetime import date
 from enum import IntEnum, Enum
 from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import LoggerFactory, getLogger
 import rag.utils.es_conn
 import rag.utils.infinity_conn

-# Logger
-LoggerFactory.set_directory(
-    os.path.join(
-        get_project_base_directory(),
-        "logs",
-        "api"))
-# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
-LoggerFactory.LEVEL = 30
-
-stat_logger = getLogger("stat")
-access_logger = getLogger("access")
-database_logger = getLogger("database")
-chat_logger = getLogger("chat")
-
 import rag.utils
 from rag.nlp import search
 from graphrag import search as kg_search
@@ -47,8 +32,6 @@ TEMP_DIRECTORY = os.path.join(get_project_base_directory(), "temp")
 RAG_FLOW_CONF_PATH = os.path.join(get_project_base_directory(), "conf")
 LIGHTEN = int(os.environ.get('LIGHTEN', "0"))

-SUBPROCESS_STD_LOG_NAME = "std.log"
-
 ERROR_REPORT = True
 ERROR_REPORT_WITH_PATH = False
api/utils/api_utils.py
CHANGED
@@ -35,11 +35,12 @@ from werkzeug.http import HTTP_STATUS_CODES
 from api.db.db_models import APIToken
 from api.settings import (
     REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC,
-
+    CLIENT_AUTHENTICATION, HTTP_APP_KEY, SECRET_KEY
 )
 from api.settings import RetCode
 from api.utils import CustomJSONEncoder, get_uuid
 from api.utils import json_dumps
+from api.utils.log_utils import logger

 requests.models.complexjson.dumps = functools.partial(
     json.dumps, cls=CustomJSONEncoder)
@@ -117,7 +118,7 @@ def get_data_error_result(code=RetCode.DATA_ERROR,


 def server_error_response(e):
-
+    logger.exception(e)
     try:
         if e.code == 401:
             return get_json_result(code=401, message=repr(e))
@@ -258,7 +259,7 @@ def construct_json_result(code=RetCode.SUCCESS, message='success', data=None):


 def construct_error_response(e):
-
+    logger.exception(e)
     try:
         if e.code == 401:
             return construct_json_result(code=RetCode.UNAUTHORIZED, message=repr(e))
api/utils/log_utils.py
CHANGED
@@ -14,300 +14,38 @@
 # limitations under the License.
 #
 import os
-import typing
-import traceback
 import logging
-import inspect
-from logging.handlers import TimedRotatingFileHandler
-from threading import RLock
-
-from api.utils import file_utils
-
-
-class LoggerFactory(object):
-
-
-    logging.basicConfig(format=LOG_FORMAT)
-    LEVEL = logging.DEBUG
-    logger_dict = {}
-    global_handler_dict = {}
-
-    LOG_DIR = None
-    PARENT_LOG_DIR = None
-    log_share = True
-
-    append_to_parent_log = None
-
-    lock = RLock()
-    # CRITICAL = 50
-    # FATAL = CRITICAL
-    # ERROR = 40
-    # WARNING = 30
-    # WARN = WARNING
-    # INFO = 20
-    # DEBUG = 10
-    # NOTSET = 0
-    levels = (10, 20, 30, 40)
-    schedule_logger_dict = {}
-
-    @staticmethod
-    def set_directory(directory=None, parent_log_dir=None,
-                      append_to_parent_log=None, force=False):
-        if parent_log_dir:
-            LoggerFactory.PARENT_LOG_DIR = parent_log_dir
-        if append_to_parent_log:
-            LoggerFactory.append_to_parent_log = append_to_parent_log
-        with LoggerFactory.lock:
-            if not directory:
-                directory = file_utils.get_project_base_directory("logs")
-            if not LoggerFactory.LOG_DIR or force:
-                LoggerFactory.LOG_DIR = directory
-            if LoggerFactory.log_share:
-                oldmask = os.umask(000)
-                os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True)
-                os.umask(oldmask)
-            else:
-                os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True)
-            for loggerName, ghandler in LoggerFactory.global_handler_dict.items():
-                for className, (logger,
-                                handler) in LoggerFactory.logger_dict.items():
-                    logger.removeHandler(ghandler)
-                ghandler.close()
-            LoggerFactory.global_handler_dict = {}
-            for className, (logger,
-                            handler) in LoggerFactory.logger_dict.items():
-                logger.removeHandler(handler)
-                _handler = None
-                if handler:
-                    handler.close()
-                if className != "default":
-                    _handler = LoggerFactory.get_handler(className)
-                    logger.addHandler(_handler)
-                LoggerFactory.assemble_global_handler(logger)
-                LoggerFactory.logger_dict[className] = logger, _handler
-
-    @staticmethod
-    def new_logger(name):
-        logger = logging.getLogger(name)
-        logger.propagate = False
-        logger.setLevel(LoggerFactory.LEVEL)
-        return logger
-
-
-
-
-            logger, handler = LoggerFactory.logger_dict[class_name]
-            if not logger:
-                logger, handler = LoggerFactory.init_logger(class_name)
-        else:
-            logger, handler = LoggerFactory.init_logger(class_name)
-        return logger
-
-    @staticmethod
-    def get_global_handler(logger_name, level=None, log_dir=None):
-        if not LoggerFactory.LOG_DIR:
-            return logging.StreamHandler()
-        if log_dir:
-            logger_name_key = logger_name + "_" + log_dir
-        else:
-            logger_name_key = logger_name + "_" + LoggerFactory.LOG_DIR
-        # if loggerName not in LoggerFactory.globalHandlerDict:
-        if logger_name_key not in LoggerFactory.global_handler_dict:
-            with LoggerFactory.lock:
-                if logger_name_key not in LoggerFactory.global_handler_dict:
-                    handler = LoggerFactory.get_handler(
-                        logger_name, level, log_dir)
-                    LoggerFactory.global_handler_dict[logger_name_key] = handler
-        return LoggerFactory.global_handler_dict[logger_name_key]
-
-    @staticmethod
-    def get_handler(class_name, level=None, log_dir=None,
-                    log_type=None, job_id=None):
-        if not log_type:
-            if not LoggerFactory.LOG_DIR or not class_name:
-                return logging.StreamHandler()
-            # return Diy_StreamHandler()
-
-            if not log_dir:
-                log_file = os.path.join(
-                    LoggerFactory.LOG_DIR,
-                    "{}.log".format(class_name))
-            else:
-                log_file = os.path.join(log_dir, "{}.log".format(class_name))
-        else:
-            log_file = os.path.join(log_dir, "rag_flow_{}.log".format(
-                log_type) if level == LoggerFactory.LEVEL else 'rag_flow_{}_error.log'.format(log_type))
-
-        os.makedirs(os.path.dirname(log_file), exist_ok=True)
-        if LoggerFactory.log_share:
-            handler = ROpenHandler(log_file,
-                                   when='D',
-                                   interval=1,
-                                   backupCount=14,
-                                   delay=True)
-        else:
-            handler = TimedRotatingFileHandler(log_file,
-                                               when='D',
-                                               interval=1,
-                                               backupCount=14,
-                                               delay=True)
-        if level:
-            handler.level = level
-
-        return handler
-
-    @staticmethod
-    def init_logger(class_name):
-        with LoggerFactory.lock:
-            logger = LoggerFactory.new_logger(class_name)
-            handler = None
-            if class_name:
-                handler = LoggerFactory.get_handler(class_name)
-                logger.addHandler(handler)
-                LoggerFactory.logger_dict[class_name] = logger, handler
-
-            else:
-                LoggerFactory.logger_dict["default"] = logger, handler
-
-            LoggerFactory.assemble_global_handler(logger)
-            return logger, handler
-
-    @staticmethod
-    def assemble_global_handler(logger):
-        if LoggerFactory.LOG_DIR:
-            for level in LoggerFactory.levels:
-                if level >= LoggerFactory.LEVEL:
-                    level_logger_name = logging._levelToName[level]
-                    logger.addHandler(
-                        LoggerFactory.get_global_handler(
-                            level_logger_name, level))
-        if LoggerFactory.append_to_parent_log and LoggerFactory.PARENT_LOG_DIR:
-            for level in LoggerFactory.levels:
-                if level >= LoggerFactory.LEVEL:
-                    level_logger_name = logging._levelToName[level]
-                    logger.addHandler(
-                        LoggerFactory.get_global_handler(level_logger_name, level, LoggerFactory.PARENT_LOG_DIR))
-
-
-def setDirectory(directory=None):
-    LoggerFactory.set_directory(directory)
-
-
-def setLevel(level):
-    LoggerFactory.LEVEL = level
-
-
-def getLogger(className=None, useLevelFile=False):
-    if className is None:
-        frame = inspect.stack()[1]
-        module = inspect.getmodule(frame[0])
-        className = 'stat'
-    return LoggerFactory.get_logger(className)
-
-
-class ROpenHandler(TimedRotatingFileHandler):
-    def _open(self):
-        prevumask = os.umask(000)
-        rtv = TimedRotatingFileHandler._open(self)
-        os.umask(prevumask)
-        return rtv
-
-
-def sql_logger(job_id='', log_type='sql'):
-    key = job_id + log_type
-    if key in LoggerFactory.schedule_logger_dict.keys():
-        return LoggerFactory.schedule_logger_dict[key]
-    return get_job_logger(job_id=job_id, log_type=log_type)
-
-
-def ready_log(msg, job=None, task=None, role=None, party_id=None, detail=None):
-    prefix, suffix = base_msg(job, task, role, party_id, detail)
-    return f"{prefix}{msg} ready{suffix}"
-
-
-def start_log(msg, job=None, task=None, role=None, party_id=None, detail=None):
-    prefix, suffix = base_msg(job, task, role, party_id, detail)
-    return f"{prefix}start to {msg}{suffix}"
-
-
-def successful_log(msg, job=None, task=None, role=None,
-                   party_id=None, detail=None):
-    prefix, suffix = base_msg(job, task, role, party_id, detail)
-    return f"{prefix}{msg} successfully{suffix}"
-
-
-def warning_log(msg, job=None, task=None, role=None,
-                party_id=None, detail=None):
-    prefix, suffix = base_msg(job, task, role, party_id, detail)
-    return f"{prefix}{msg} is not effective{suffix}"
-
-
-def failed_log(msg, job=None, task=None, role=None,
-               party_id=None, detail=None):
-    prefix, suffix = base_msg(job, task, role, party_id, detail)
-    return f"{prefix}failed to {msg}{suffix}"
-
-
-def base_msg(job=None, task=None, role: str = None,
-             party_id: typing.Union[str, int] = None, detail=None):
-    if detail:
-        detail_msg = f" detail: \n{detail}"
-    else:
-        detail_msg = ""
-    if task is not None:
-        return f"task {task.f_task_id} {task.f_task_version} ", f" on {task.f_role} {task.f_party_id}{detail_msg}"
-    elif job is not None:
-        return "", f" on {job.f_role} {job.f_party_id}{detail_msg}"
-    elif role and party_id:
-        return "", f" on {role} {party_id}{detail_msg}"
-    else:
-        return "", f"{detail_msg}"
-
-
-def exception_to_trace_string(ex):
-    return "".join(traceback.TracebackException.from_exception(ex).format())
-
-
-def get_logger_base_dir():
-    job_log_dir = file_utils.get_rag_flow_directory('logs')
-    return job_log_dir
-
-
-def get_job_logger(job_id, log_type):
-    rag_flow_log_dir = file_utils.get_rag_flow_directory('logs', 'rag_flow')
-    job_log_dir = file_utils.get_rag_flow_directory('logs', job_id)
-    if not job_id:
-        log_dirs = [rag_flow_log_dir]
-    else:
-        if log_type == 'audit':
-            log_dirs = [job_log_dir, rag_flow_log_dir]
-        else:
-            log_dirs = [job_log_dir]
-    if LoggerFactory.log_share:
-        oldmask = os.umask(000)
-        os.makedirs(job_log_dir, exist_ok=True)
-        os.makedirs(rag_flow_log_dir, exist_ok=True)
-        os.umask(oldmask)
-    else:
-        os.makedirs(job_log_dir, exist_ok=True)
-        os.makedirs(rag_flow_log_dir, exist_ok=True)
-    logger = LoggerFactory.new_logger(f"{job_id}_{log_type}")
-    for job_log_dir in log_dirs:
-        handler = LoggerFactory.get_handler(class_name=None, level=LoggerFactory.LEVEL,
-                                            log_dir=job_log_dir, log_type=log_type, job_id=job_id)
-        error_handler = LoggerFactory.get_handler(
-            class_name=None,
-            level=logging.ERROR,
-            log_dir=job_log_dir,
-            log_type=log_type,
-            job_id=job_id)
-        logger.addHandler(handler)
-        logger.addHandler(error_handler)
-    with LoggerFactory.lock:
-        LoggerFactory.schedule_logger_dict[job_id + log_type] = logger
-    return logger
+from logging.handlers import RotatingFileHandler
+
+from api.utils.file_utils import get_project_base_directory
+
+LOG_LEVEL = logging.INFO
+LOG_FILE = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"ragflow_{os.getpid()}.log"))
+LOG_FORMAT = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"
+logger = None
+
+def getLogger():
+    global logger
+    if logger is not None:
+        return logger
+
+    print(f"log file path: {LOG_FILE}")
+    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
+    logger = logging.getLogger("ragflow")
+    logger.setLevel(LOG_LEVEL)
+
+    handler1 = RotatingFileHandler(LOG_FILE, maxBytes=10*1024*1024, backupCount=5)
+    handler1.setLevel(LOG_LEVEL)
+    formatter1 = logging.Formatter(LOG_FORMAT)
+    handler1.setFormatter(formatter1)
+    logger.addHandler(handler1)
+
+    handler2 = logging.StreamHandler()
+    handler2.setLevel(LOG_LEVEL)
+    formatter2 = logging.Formatter(LOG_FORMAT)
+    handler2.setFormatter(formatter2)
+    logger.addHandler(handler2)
+
+    return logger
+
+logger = getLogger()
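
The module above replaces the per-category LoggerFactory with one process-wide logger: a RotatingFileHandler writing logs/ragflow_<pid>.log (10 MB per file, five backups) plus a StreamHandler, both at INFO. Because the module instantiates the logger at import time, callers only ever import the name, as the rest of this diff shows. A minimal usage sketch (the function below is a hypothetical caller):

from api.utils.log_utils import logger

def risky_operation():
    try:
        1 / 0
    except Exception:
        # logger.exception logs at ERROR level and appends the traceback,
        # which is the pattern this PR applies to most except blocks.
        logger.exception("risky_operation failed")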
deepdoc/parser/pdf_parser.py
CHANGED
@@ -19,13 +19,14 @@ from io import BytesIO
 import re
 import pdfplumber
 import logging
-from PIL import Image
+from PIL import Image
 import numpy as np
 from timeit import default_timer as timer
 from pypdf import PdfReader as pdf2_read

 from api.settings import LIGHTEN
 from api.utils.file_utils import get_project_base_directory
+from api.utils.log_utils import logger
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import rag_tokenizer
 from copy import deepcopy
@@ -49,15 +50,15 @@ class RAGFlowPdfParser:
             import torch
             if torch.cuda.is_available():
                 self.updown_cnt_mdl.set_param({"device": "cuda"})
-        except Exception
-
+        except Exception:
+            logger.exception("RAGFlowPdfParser __init__")
         try:
             model_dir = os.path.join(
                 get_project_base_directory(),
                 "rag/res/deepdoc")
             self.updown_cnt_mdl.load_model(os.path.join(
                 model_dir, "updown_concat_xgb.model"))
-        except Exception
+        except Exception:
             model_dir = snapshot_download(
                 repo_id="InfiniFlow/text_concat_xgb_v1.0",
                 local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
@@ -187,7 +188,7 @@ class RAGFlowPdfParser:
         return True

     def _table_transformer_job(self, ZM):
-
+        logger.info("Table processing...")
         imgs, pos = [], []
         tbcnt = [0]
         MARGIN = 10
@@ -425,12 +426,12 @@ class RAGFlowPdfParser:
                 detach_feats = [b["x1"] < b_["x0"],
                                 b["x0"] > b_["x1"]]
                 if (any(feats) and not any(concatting_feats)) or any(detach_feats):
-
+                    logger.info("{} {} {} {}".format(
                         b["text"],
                         b_["text"],
                         any(feats),
                         any(concatting_feats),
-
+                    ))
                     i += 1
                     continue
                 # merge up and down
@@ -726,14 +727,14 @@ class RAGFlowPdfParser:
             # continue
             if tv < fv and tk:
                 tables[tk].insert(0, c)
-
+                logger.debug(
                     "TABLE:" +
                     self.boxes[i]["text"] +
                     "; Cap: " +
                     tk)
             elif fk:
                 figures[fk].insert(0, c)
-
+                logger.debug(
                     "FIGURE:" +
                     self.boxes[i]["text"] +
                     "; Cap: " +
@@ -760,7 +761,7 @@ class RAGFlowPdfParser:
             if ii is not None:
                 b = louts[ii]
             else:
-
+                logger.warn(
                     f"Missing layout match: {pn + 1},%s" %
                     (bxs[0].get(
                         "layoutno", "")))
@@ -918,8 +919,8 @@ class RAGFlowPdfParser:
                 if usefull(boxes[0]):
                     dfs(boxes[0], 0)
                 else:
-
-            except Exception
+                    logger.debug("WASTE: " + boxes[0]["text"])
+            except Exception:
                 pass
             boxes.pop(0)
         mw = np.mean(widths)
@@ -927,7 +928,7 @@ class RAGFlowPdfParser:
             res.append(
                 "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
         else:
-
+            logger.debug("REMOVED: " +
                          "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)
@@ -938,8 +939,8 @@ class RAGFlowPdfParser:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
-        except Exception
-
+        except Exception:
+            logger.exception("total_page_number")

    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
@@ -962,8 +963,8 @@ class RAGFlowPdfParser:
            self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
-        except Exception
-
+        except Exception:
+            logger.exception("RAGFlowPdfParser __images__")

        self.outlines = []
        try:
@@ -979,11 +980,11 @@ class RAGFlowPdfParser:

            dfs(outlines, 0)
        except Exception as e:
-
+            logger.warning(f"Outlines exception: {e}")
        if not self.outlines:
-
+            logger.warning("Miss outlines")

-
+        logger.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
@@ -1023,7 +1024,7 @@ class RAGFlowPdfParser:
        self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                    "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

-
+        logger.info("Is it English:", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
@@ -1162,10 +1163,10 @@ class PlainParser(object):
                dfs(a, depth + 1)

            dfs(outlines, 0)
-        except Exception
-
+        except Exception:
+            logger.exception("Outlines exception")
        if not self.outlines:
-
+            logger.warning("Miss outlines")

        return [(l, "") for l in lines], []
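
One call in this hunk, logger.info("Is it English:", self.is_english), carries print-style arguments; with the stdlib logging module the second argument is treated as a %-formatting parameter, and since the message has no placeholder the record fails to format at emit time. A form that renders as intended (a suggested correction, not part of the PR as recorded):

# Lazy %-style formatting, the convention stdlib logging expects:
logger.info("Is it English: %s", self.is_english)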
deepdoc/parser/resume/entities/corporations.py
CHANGED
@@ -11,10 +11,15 @@
 # limitations under the License.
 #

-import re
+import re
+import json
+import os
 import pandas as pd
 from rag.nlp import rag_tokenizer
 from . import regions
+from api.utils.log_utils import logger
+
+
 current_file_path = os.path.dirname(os.path.abspath(__file__))
 GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
 GOODS["cid"] = GOODS["cid"].astype(str)
@@ -27,7 +32,7 @@ def baike(cid, default_v=0):
     global GOODS
     try:
         return GOODS.loc[str(cid), "len"]
-    except Exception
+    except Exception:
         pass
     return default_v

@@ -65,7 +70,8 @@ def rmNoise(n):
 GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
 for c,v in CORP_TAG.items():
     cc = corpNorm(rmNoise(c), False)
-    if not cc:
+    if not cc:
+        logger.info(c)
 CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}

 def is_good(nm):
deepdoc/parser/resume/step_two.py
CHANGED
@@ -11,13 +11,19 @@
 # limitations under the License.
 #

-import re
-
+import re
+import copy
+import time
+import datetime
+import demjson3
+import traceback
+import signal
 import numpy as np
 from deepdoc.parser.resume.entities import degrees, schools, corporations
 from rag.nlp import rag_tokenizer, surname
 from xpinyin import Pinyin
 from contextlib import contextmanager
+from api.utils.log_utils import logger


 class TimeoutException(Exception): pass
@@ -79,7 +85,7 @@ def forEdu(cv):
             y, m, d = getYMD(dt)
             st_dt.append(str(y))
             e["start_dt_kwd"] = str(y)
-        except Exception
+        except Exception:
             pass

         r = schools.select(n.get("school_name", ""))
@@ -158,7 +164,7 @@ def forEdu(cv):
             y, m, d = getYMD(edu_end_dt)
             cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
         except Exception as e:
-
+            logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
         if sch:
             cv["school_name_kwd"] = sch
         if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
@@ -233,7 +239,7 @@ def forWork(cv):
         if type(n) == type(""):
             try:
                 n = json_loads(n)
-            except Exception
+            except Exception:
                 continue

         if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
@@ -269,8 +275,8 @@ def forWork(cv):

         try:
             duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
-        except Exception
-
+        except Exception:
+            logger.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))

         if n.get("scale"):
             r = re.search(r"^([0-9]+)", str(n["scale"]))
@@ -327,7 +333,7 @@ def forWork(cv):
         y, m, d = getYMD(work_st_tm)
         cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
     except Exception as e:
-
+        logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))

     cv["job_num_int"] = 0
     if duas:
@@ -457,8 +463,8 @@ def parse(cv):
             t = k[:-4]
             cv[f"{t}_kwd"] = nms
             cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
-        except Exception
-
+        except Exception:
+            logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
             cv[k] = []

     # tokenize fields
@@ -524,7 +530,7 @@ def parse(cv):
     if not y: y = "2012"
     if not m: m = "01"
     if not d: d = "01"
-    cv["updated_at_dt"] =
+    cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
     # long text tokenize

     if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
@@ -556,10 +562,10 @@ def parse(cv):
             cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
         elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
             y, m, d = getYMD(str(cv["work_start_time"]))
-            cv["work_start_dt"] =
+            cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
             cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
     except Exception as e:
-
+        logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
     if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

     keys = list(cv.keys())
@@ -574,7 +580,7 @@ def parse(cv):

     cv["tob_resume_id"] = str(cv["tob_resume_id"])
     cv["id"] = cv["tob_resume_id"]
-
+    logger.info("CCCCCCCCCCCCCCC")

     return dealWithInt64(cv)

@@ -589,4 +595,3 @@ def dealWithInt64(d):

     if isinstance(d, np.integer): d = int(d)
     return d
-
deepdoc/vision/operators.py
CHANGED
@@ -20,6 +20,7 @@ import cv2
 import numpy as np
 import math
 from PIL import Image
+from api.utils.log_utils import logger


 class DecodeImage(object):
@@ -402,7 +403,7 @@ class DetResizeForTest(object):
                 return None, (None, None)
             img = cv2.resize(img, (int(resize_w), int(resize_h)))
         except BaseException:
-
+            logger.exception("{} {} {}".format(img.shape, resize_w, resize_h))
             sys.exit(0)
         ratio_h = resize_h / float(h)
         ratio_w = resize_w / float(w)
@@ -452,7 +453,6 @@ class E2EResizeForTest(object):
         return data

     def resize_image_for_totaltext(self, im, max_side_len=512):
-
         h, w, _ = im.shape
         resize_w = w
         resize_h = h
deepdoc/vision/recognizer.py
CHANGED
@@ -19,6 +19,7 @@ from huggingface_hub import snapshot_download

 from api.utils.file_utils import get_project_base_directory
 from .operators import *
+from api.utils.log_utils import logger


 class Recognizer(object):
@@ -439,7 +440,7 @@ class Recognizer(object):
             end_index = min((i + 1) * batch_size, len(imgs))
             batch_image_list = imgs[start_index:end_index]
             inputs = self.preprocess(batch_image_list)
-
+            logger.info("preprocess")
             for ins in inputs:
                 bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr)
                 res.append(bb)
deepdoc/vision/seeit.py
CHANGED
@@ -14,6 +14,7 @@
 import os
 import PIL
 from PIL import ImageDraw
+from api.utils.log_utils import logger


 def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
@@ -24,7 +25,7 @@ def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):

         out_path = os.path.join(output_dir, f"{idx}.jpg")
         im.save(out_path, quality=95)
-
+        logger.info("save result to: " + out_path)


 def draw_box(im, result, lables, threshold=0.5):
deepdoc/vision/t_recognizer.py
CHANGED
@@ -10,7 +10,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import os
+import os
+import sys
+from api.utils.log_utils import logger
+
 sys.path.insert(
     0,
     os.path.abspath(
@@ -56,7 +59,7 @@ def main(args):
         } for t in lyt]
         img = draw_box(images[i], lyt, labels, float(args.threshold))
         img.save(outputs[i], quality=95)
-
+        logger.info("save result to: " + outputs[i])


 def get_table_html(img, tb_cpns, ocr):
graphrag/claim_extractor.py
CHANGED
@@ -7,7 +7,6 @@ Reference:

 import argparse
 import json
-import logging
 import re
 import traceback
 from dataclasses import dataclass
@@ -18,12 +17,12 @@ import tiktoken
 from graphrag.claim_prompt import CLAIM_EXTRACTION_PROMPT, CONTINUE_PROMPT, LOOP_PROMPT
 from rag.llm.chat_model import Base as CompletionLLM
 from graphrag.utils import ErrorHandlerFn, perform_variable_replacements
+from api.utils.log_utils import logger

 DEFAULT_TUPLE_DELIMITER = "<|>"
 DEFAULT_RECORD_DELIMITER = "##"
 DEFAULT_COMPLETION_DELIMITER = "<|COMPLETE|>"
 CLAIM_MAX_GLEANINGS = 1
-log = logging.getLogger(__name__)


 @dataclass
@@ -127,7 +126,7 @@ class ClaimExtractor:
             ]
             source_doc_map[document_id] = text
         except Exception as e:
-
+            logger.exception("error extracting claim")
             self._on_error(
                 e,
                 traceback.format_exc(),
@@ -266,4 +265,4 @@ if __name__ == "__main__":
         "claim_description": ""
     }
     claim = ex(info)
-
+    logger.info(json.dumps(claim.output, ensure_ascii=False, indent=2))
graphrag/community_reports_extractor.py
CHANGED
@@ -6,11 +6,10 @@ Reference:
 """

 import json
-import logging
 import re
 import traceback
 from dataclasses import dataclass
-from typing import
+from typing import List, Callable
 import networkx as nx
 import pandas as pd
 from graphrag import leiden
@@ -20,8 +19,7 @@ from rag.llm.chat_model import Base as CompletionLLM
 from graphrag.utils import ErrorHandlerFn, perform_variable_replacements, dict_has_keys_with_types
 from rag.utils import num_tokens_from_string
 from timeit import default_timer as timer
-
-log = logging.getLogger(__name__)
+from api.utils.log_utils import logger


 @dataclass
@@ -82,7 +80,7 @@ class CommunityReportsExtractor:
             response = re.sub(r"[^\}]*$", "", response)
             response = re.sub(r"\{\{", "{", response)
             response = re.sub(r"\}\}", "}", response)
-
+            logger.info(response)
             response = json.loads(response)
             if not dict_has_keys_with_types(response, [
                 ("title", str),
@@ -94,7 +92,7 @@ class CommunityReportsExtractor:
             response["weight"] = weight
             response["entities"] = ents
         except Exception as e:
-
+            logger.exception("CommunityReportsExtractor got exception")
             self._on_error(e, traceback.format_exc(), None)
             continue

@@ -127,5 +125,4 @@ class CommunityReportsExtractor:
         report_sections = "\n\n".join(
             f"## {finding_summary(f)}\n\n{finding_explanation(f)}" for f in findings
         )
-
         return f"# {title}\n\n{summary}\n\n{report_sections}"
graphrag/index.py
CHANGED
@@ -28,6 +28,7 @@ from graphrag.graph_extractor import GraphExtractor, DEFAULT_ENTITY_TYPES
 from graphrag.mind_map_extractor import MindMapExtractor
 from rag.nlp import rag_tokenizer
 from rag.utils import num_tokens_from_string
+from api.utils.log_utils import logger


 def graph_merge(g1, g2):
@@ -94,7 +95,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
     chunks = []
     for n, attr in graph.nodes(data=True):
         if attr.get("rank", 0) == 0:
-
+            logger.info(f"Ignore entity: {n}")
             continue
         chunk = {
             "name_kwd": n,
@@ -136,7 +137,7 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: List[str], callback, en
     mg = mindmap(_chunks).output
     if not len(mg.keys()): return chunks

-
+    logger.info(json.dumps(mg, ensure_ascii=False, indent=2))
     chunks.append(
         {
             "content_with_weight": json.dumps(mg, ensure_ascii=False, indent=2),
graphrag/mind_map_extractor.py
CHANGED
@@ -18,7 +18,6 @@ import collections
 import logging
 import os
 import re
-import logging
 import traceback
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -30,6 +29,7 @@ from rag.llm.chat_model import Base as CompletionLLM
 import markdown_to_json
 from functools import reduce
 from rag.utils import num_tokens_from_string
+from api.utils.log_utils import logger


 @dataclass
@@ -193,6 +193,6 @@ class MindMapExtractor:
         gen_conf = {"temperature": 0.5}
         response = self._llm.chat(text, [{"role": "user", "content": "Output:"}], gen_conf)
         response = re.sub(r"```[^\n]*", "", response)
-
-
+        logger.info(response)
+        logger.info(self._todict(markdown_to_json.dictify(response)))
        return self._todict(markdown_to_json.dictify(response))
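One detail of the last hunk: the second logger.info call parses the LLM response with markdown_to_json, and the return line then parses it again. A drop-in variant of those three lines that parses once — a sketch, not what the PR ships:

        # Parse once, log the result, and return the same object:
        result = self._todict(markdown_to_json.dictify(response))
        logger.info(response)
        logger.info(result)
        return result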
intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py
CHANGED
@@ -2,7 +2,7 @@ import requests
 from bridge.context import ContextType  # Import Context, ContextType
 from bridge.reply import Reply, ReplyType  # Import Reply, ReplyType
 from bridge import *
-from
+from api.utils.log_utils import logger
 from plugins import Plugin, register  # Import Plugin and register
 from plugins.event import Event, EventContext, EventAction  # Import event-related classes

@@ -76,7 +76,7 @@ class RAGFlowChat(Plugin):
             logger.error(f"[RAGFlowChat] HTTP error when creating conversation: {response.status_code}")
             return f"Sorry, unable to connect to RAGFlow API (create conversation). HTTP status code: {response.status_code}"
         except Exception as e:
-            logger.exception(
+            logger.exception("[RAGFlowChat] Exception when creating conversation")
             return f"Sorry, an internal error occurred: {str(e)}"

         # Step 2: Send the message and get a reply
@@ -108,5 +108,5 @@ class RAGFlowChat(Plugin):
             logger.error(f"[RAGFlowChat] HTTP error when getting answer: {response.status_code}")
             return f"Sorry, unable to connect to RAGFlow API (get reply). HTTP status code: {response.status_code}"
         except Exception as e:
-            logger.exception(
+            logger.exception("[RAGFlowChat] Exception when getting answer")
             return f"Sorry, an internal error occurred: {str(e)}"
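The replacement calls rely on a stdlib behavior: Logger.exception logs at ERROR level and automatically appends the active traceback, so a plain message string is enough inside an except block. A self-contained check:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ragflow")

try:
    {}["missing"]  # raises KeyError
except Exception:
    # Logs "ERROR ... Exception when creating conversation" followed by
    # the full traceback, with no explicit traceback.format_exc() needed.
    logger.exception("[RAGFlowChat] Exception when creating conversation")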
rag/app/book.py
CHANGED
@@ -20,6 +20,7 @@ from rag.nlp import bullets_category, is_english,remove_contents_table, \
     tokenize_chunks
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
+from api.utils.log_utils import logger


 class Pdf(PdfParser):
@@ -38,7 +39,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-
+        logger.info("layouts: {}".format(timer() - start))
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
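book.py here, and laws.py below, wrap layout analysis in the same start = timer() / logger.info pattern. Not part of the PR, but the repetition could be folded into a small context manager — a sketch under those assumptions:

import logging
from contextlib import contextmanager
from timeit import default_timer as timer

logger = logging.getLogger("ragflow")

@contextmanager
def log_elapsed(label: str):
    """Log the wall-clock seconds spent inside the with-block."""
    start = timer()
    try:
        yield
    finally:
        logger.info("%s: %s", label, timer() - start)

# Usage, mirroring the hunk above:
#     with log_elapsed("layouts"):
#         self._layouts_rec(zoomin)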
rag/app/email.py
CHANGED
@@ -18,7 +18,7 @@ import re
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
 from deepdoc.parser import HtmlParser, TxtParser
 from timeit import default_timer as timer
-from
+from api.utils.log_utils import logger
 import io


@@ -86,7 +86,7 @@ def chunk(
     )

     main_res.extend(tokenize_chunks(chunks, doc, eng, None))
-
+    logger.info("naive_merge({}): {}".format(filename, timer() - st))
     # get the attachment info
     for part in msg.iter_attachments():
         content_disposition = part.get("Content-Disposition")
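The timing line added to email.py references st, which is not defined in this hunk; it is presumably a timer started just before naive_merge earlier in chunk(). The assumed pattern, sketched with illustrative arguments:

from timeit import default_timer as timer

st = timer()
chunks = naive_merge(sections, chunk_token_num, delimiter)  # arguments illustrative
logger.info("naive_merge({}): {}".format(filename, timer() - st))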
rag/app/laws.py
CHANGED
@@ -21,7 +21,7 @@ from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge,
     make_colon_as_title, tokenize_chunks, docx_question_level
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
-from
+from api.utils.log_utils import logger


 class Docx(DocxParser):
@@ -122,8 +122,8 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-
-
+        logger.info("layouts: {}".format(
+            timer() - start))
         self._naive_vertical_merge()

         callback(0.8, "Text extraction finished")