KevinHuSh
committed on
Commit
·
89444d3
1
Parent(s):
7d85666
fix github account login issue (#132)
Browse files- api/apps/user_app.py +2 -0
- deepdoc/parser/pdf_parser.py +1 -1
- rag/app/manual.py +1 -4
- rag/app/qa.py +9 -3
- rag/nlp/__init__.py +1 -0
- rag/nlp/query.py +1 -1
api/apps/user_app.py
CHANGED
@@ -106,7 +106,9 @@ def github_callback():
|
|
106 |
stat_logger.exception(e)
|
107 |
return redirect("/?error=%s"%str(e))
|
108 |
user = users[0]
|
|
|
109 |
login_user(user)
|
|
|
110 |
return redirect("/?auth=%s" % user.get_id())
|
111 |
|
112 |
|
|
|
106 |
stat_logger.exception(e)
|
107 |
return redirect("/?error=%s"%str(e))
|
108 |
user = users[0]
|
109 |
+
user.access_token = get_uuid()
|
110 |
login_user(user)
|
111 |
+
user.save()
|
112 |
return redirect("/?auth=%s" % user.get_id())
|
113 |
|
114 |
|
deepdoc/parser/pdf_parser.py
CHANGED
@@ -639,7 +639,7 @@ class HuParser:
|
|
639 |
mink = ""
|
640 |
minv = 1000000000
|
641 |
for k, bxs in tbls.items():
|
642 |
-
for b in bxs
|
643 |
if b.get("layout_type", "").find("caption") >= 0:
|
644 |
continue
|
645 |
y_dis = self._y_dis(c, b)
|
|
|
639 |
mink = ""
|
640 |
minv = 1000000000
|
641 |
for k, bxs in tbls.items():
|
642 |
+
for b in bxs:
|
643 |
if b.get("layout_type", "").find("caption") >= 0:
|
644 |
continue
|
645 |
y_dis = self._y_dis(c, b)
|
rag/app/manual.py
CHANGED
@@ -62,9 +62,6 @@ class Pdf(PdfParser):
|
|
62 |
for b in self.boxes:
|
63 |
b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
|
64 |
|
65 |
-
# merge chunks with the same bullets
|
66 |
-
self._merge_with_same_bullet()
|
67 |
-
|
68 |
# set pivot using the most frequent type of title,
|
69 |
# then merge between 2 pivot
|
70 |
bull = bullets_category([b["text"] for b in self.boxes])
|
@@ -79,7 +76,7 @@ class Pdf(PdfParser):
|
|
79 |
|
80 |
sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
|
81 |
for (img, rows), poss in tbls:
|
82 |
-
sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
|
83 |
|
84 |
chunks = []
|
85 |
last_sid = -2
|
|
|
62 |
for b in self.boxes:
|
63 |
b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
|
64 |
|
|
|
|
|
|
|
65 |
# set pivot using the most frequent type of title,
|
66 |
# then merge between 2 pivot
|
67 |
bull = bullets_category([b["text"] for b in self.boxes])
|
|
|
76 |
|
77 |
sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
|
78 |
for (img, rows), poss in tbls:
|
79 |
+
sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
|
80 |
|
81 |
chunks = []
|
82 |
last_sid = -2
|
rag/app/qa.py
CHANGED
@@ -11,6 +11,7 @@
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import re
|
|
|
14 |
from io import BytesIO
|
15 |
from nltk import word_tokenize
|
16 |
from openpyxl import load_workbook
|
@@ -93,12 +94,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
93 |
All the deformed lines will be ignored.
|
94 |
Every pair of Q&A will be treated as a chunk.
|
95 |
"""
|
|
|
96 |
res = []
|
|
|
|
|
|
|
|
|
97 |
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
98 |
callback(0.1, "Start to parse.")
|
99 |
excel_parser = Excel()
|
100 |
for q, a in excel_parser(filename, binary, callback):
|
101 |
-
res.append(beAdoc(
|
102 |
return res
|
103 |
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
|
104 |
callback(0.1, "Start to parse.")
|
@@ -113,14 +119,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
113 |
break
|
114 |
txt += l
|
115 |
lines = txt.split("\n")
|
116 |
-
|
117 |
fails = []
|
118 |
for i, line in enumerate(lines):
|
119 |
arr = [l for l in line.split("\t") if len(l) > 1]
|
120 |
if len(arr) != 2:
|
121 |
fails.append(str(i))
|
122 |
continue
|
123 |
-
res.append(beAdoc(
|
124 |
if len(res) % 999 == 0:
|
125 |
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
126 |
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
import re
|
14 |
+
from copy import deepcopy
|
15 |
from io import BytesIO
|
16 |
from nltk import word_tokenize
|
17 |
from openpyxl import load_workbook
|
|
|
94 |
All the deformed lines will be ignored.
|
95 |
Every pair of Q&A will be treated as a chunk.
|
96 |
"""
|
97 |
+
eng = lang.lower() == "english"
|
98 |
res = []
|
99 |
+
doc = {
|
100 |
+
"docnm_kwd": filename,
|
101 |
+
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
102 |
+
}
|
103 |
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
104 |
callback(0.1, "Start to parse.")
|
105 |
excel_parser = Excel()
|
106 |
for q, a in excel_parser(filename, binary, callback):
|
107 |
+
res.append(beAdoc(deepcopy(doc), q, a, eng))
|
108 |
return res
|
109 |
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
|
110 |
callback(0.1, "Start to parse.")
|
|
|
119 |
break
|
120 |
txt += l
|
121 |
lines = txt.split("\n")
|
122 |
+
#is_english([rmPrefix(l) for l in lines[:100]])
|
123 |
fails = []
|
124 |
for i, line in enumerate(lines):
|
125 |
arr = [l for l in line.split("\t") if len(l) > 1]
|
126 |
if len(arr) != 2:
|
127 |
fails.append(str(i))
|
128 |
continue
|
129 |
+
res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
|
130 |
if len(res) % 999 == 0:
|
131 |
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
132 |
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
rag/nlp/__init__.py
CHANGED
@@ -76,6 +76,7 @@ def is_english(texts):
|
|
76 |
|
77 |
def tokenize(d, t, eng):
|
78 |
d["content_with_weight"] = t
|
|
|
79 |
if eng:
|
80 |
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
|
81 |
d["content_ltks"] = " ".join([stemmer.stem(w)
|
|
|
76 |
|
77 |
def tokenize(d, t, eng):
|
78 |
d["content_with_weight"] = t
|
79 |
+
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
80 |
if eng:
|
81 |
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
|
82 |
d["content_ltks"] = " ".join([stemmer.stem(w)
|
rag/nlp/query.py
CHANGED
@@ -29,7 +29,7 @@ class EsQueryer:
|
|
29 |
for t in arr:
|
30 |
if not re.match(r"[a-zA-Z]+$", t):
|
31 |
e += 1
|
32 |
-
return e * 1. / len(arr) >= 0.
|
33 |
|
34 |
@staticmethod
|
35 |
def rmWWW(txt):
|
|
|
29 |
for t in arr:
|
30 |
if not re.match(r"[a-zA-Z]+$", t):
|
31 |
e += 1
|
32 |
+
return e * 1. / len(arr) >= 0.7
|
33 |
|
34 |
@staticmethod
|
35 |
def rmWWW(txt):
|