KevinHuSh committed
Commit 89444d3 · 1 Parent(s): 7d85666

fix github account login issue (#132)

api/apps/user_app.py CHANGED

```diff
@@ -106,7 +106,9 @@ def github_callback():
         stat_logger.exception(e)
         return redirect("/?error=%s"%str(e))
     user = users[0]
+    user.access_token = get_uuid()
     login_user(user)
+    user.save()
     return redirect("/?auth=%s" % user.get_id())


```
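The change mints a fresh access token on every GitHub OAuth login and persists it before redirecting, so the token the front end receives via the `auth` query parameter matches what is stored for the user. Below is a minimal sketch of the patched tail of the callback, assuming Flask-Login's `login_user`, Flask's `redirect`, and the project's `get_uuid()` helper; error handling from the file is omitted and the function name is only illustrative.

```python
from flask import redirect
from flask_login import login_user

from api.utils import get_uuid   # project helper assumed from the surrounding file


def finish_github_login(users):
    user = users[0]
    user.access_token = get_uuid()   # rotate the token the front end will use
    login_user(user)                 # establish the Flask-Login session
    user.save()                      # persist the new token before redirecting
    return redirect("/?auth=%s" % user.get_id())
```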
deepdoc/parser/pdf_parser.py CHANGED

```diff
@@ -639,7 +639,7 @@ class HuParser:
         mink = ""
         minv = 1000000000
         for k, bxs in tbls.items():
-            for b in bxs[:10]:
+            for b in bxs:
                 if b.get("layout_type", "").find("caption") >= 0:
                     continue
                 y_dis = self._y_dis(c, b)
```
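This loop looks for the table whose boxes sit closest, vertically, to a candidate box `c`; scanning only the first 10 boxes of each table could miss the truly nearest box in a long table, so the slice is dropped. A hedged sketch of the search under that reading: `tbls`, `c`, and the caption check mirror the surrounding HuParser code, while `vertical_gap` stands in for `self._y_dis` and the min-update is an assumption about the omitted continuation of the loop.

```python
def nearest_table(tbls, c, vertical_gap):
    mink, minv = "", 1000000000
    for k, bxs in tbls.items():
        # Scan every box of the table: the old bxs[:10] slice could skip
        # the box that is actually closest to the candidate c.
        for b in bxs:
            if b.get("layout_type", "").find("caption") >= 0:
                continue
            y_dis = vertical_gap(c, b)
            if y_dis < minv:          # assumed: keep the closest table seen so far
                mink, minv = k, y_dis
    return mink
```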
rag/app/manual.py CHANGED

```diff
@@ -62,9 +62,6 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
 
-        # merge chunks with the same bullets
-        self._merge_with_same_bullet()
-
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
         bull = bullets_category([b["text"] for b in self.boxes])
@@ -79,7 +76,7 @@ class Pdf(PdfParser):
 
         sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
         for (img, rows), poss in tbls:
-            sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
+            sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
 
         chunks = []
         last_sid = -2
```
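Two things change here: the `_merge_with_same_bullet()` pre-merge is removed, and extracted tables are appended more defensively, since `rows` may arrive either as one HTML string or as a list of row strings, with page indices rebased against `from_page`. A small sketch of that appending step, with names taken from rag/app/manual.py and made-up variable contents:

```python
def append_table_sections(sections, tbls, from_page):
    for (img, rows), poss in tbls:
        # rows may be a single HTML string or a list of row strings.
        text = rows if isinstance(rows, str) else rows[0]
        # Rebase page numbers so they count from the first parsed page
        # rather than from the start of the PDF.
        positions = [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]
        sections.append((text, -1, positions))
    return sections
```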
rag/app/qa.py CHANGED

```diff
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
 import re
+from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
@@ -93,12 +94,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     All the deformed lines will be ignored.
     Every pair of Q&A will be treated as a chunk.
     """
+    eng = lang.lower() == "english"
     res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
         for q, a in excel_parser(filename, binary, callback):
-            res.append(beAdoc({}, q, a, excel_parser.is_english))
+            res.append(beAdoc(deepcopy(doc), q, a, eng))
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -113,14 +119,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 break
             txt += l
         lines = txt.split("\n")
-        eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
+        #is_english([rmPrefix(l) for l in lines[:100]])
         fails = []
         for i, line in enumerate(lines):
             arr = [l for l in line.split("\t") if len(l) > 1]
             if len(arr) != 2:
                 fails.append(str(i))
                 continue
-            res.append(beAdoc({}, arr[0], arr[1], eng))
+            res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
             if len(res) % 999 == 0:
                 callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                     f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
```
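Every Q&A pair becomes its own chunk, and the fix seeds each chunk with shared document metadata (`docnm_kwd`, `title_tks`) copied via `deepcopy`, instead of passing an empty dict into `beAdoc`. A self-contained sketch of the pattern, where `be_adoc` only stands in for the project's `beAdoc` helper and the sample Q&A pairs are invented:

```python
from copy import deepcopy


def be_adoc(d, q, a, eng):
    # Stand-in for rag/app/qa.py's beAdoc: fill the pre-seeded dict with
    # the question/answer content and return it as one chunk.
    d["content_with_weight"] = "\t".join([q, a])
    return d


doc = {"docnm_kwd": "faq.xlsx", "title_tks": "faq"}   # shared per-document fields
res = []
for q, a in [("What is RAGFlow?", "A RAG engine."), ("Is it open source?", "Yes.")]:
    # deepcopy gives every chunk its own copy of the document metadata,
    # so filling one chunk cannot leak fields into the next.
    res.append(be_adoc(deepcopy(doc), q, a, True))
```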
rag/nlp/__init__.py CHANGED

```diff
@@ -76,6 +76,7 @@ def is_english(texts):
 
 def tokenize(d, t, eng):
     d["content_with_weight"] = t
+    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
     if eng:
         t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
         d["content_ltks"] = " ".join([stemmer.stem(w)
```
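The added substitution strips HTML table markup (`<table>`, `<tr>`, `<td>`, `<th>`, `<caption>`) from the text before it is tokenized, while `content_with_weight` keeps the original markup. A quick, runnable check of the regex on a made-up table snippet:

```python
import re

t = "<table><tr><td>GPU</td><td>A100</td></tr></table>"
stripped = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
print(stripped)  # tags are replaced by spaces, so only the cell text reaches the tokenizer
```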
rag/nlp/query.py CHANGED

```diff
@@ -29,7 +29,7 @@ class EsQueryer:
         for t in arr:
             if not re.match(r"[a-zA-Z]+$", t):
                 e += 1
-        return e * 1. / len(arr) >= 0.8
+        return e * 1. / len(arr) >= 0.7
 
     @staticmethod
     def rmWWW(txt):
```
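The ratio test counts tokens that are not purely alphabetic and treats the query as non-English when that share reaches the threshold; lowering it from 0.8 to 0.7 lets queries that mix in a couple of English terms still take that path. A small reconstruction with invented tokens (the method's real name and the surrounding EsQueryer class are omitted):

```python
import re


def looks_non_english(arr, threshold=0.7):
    # Share of tokens that are not purely a-z/A-Z.
    e = sum(1 for t in arr if not re.match(r"[a-zA-Z]+$", t))
    return e * 1. / len(arr) >= threshold


tokens = ["RAGFlow", "的", "部署", "方式", "是", "什么", "docker", "compose", "还是", "源码"]
print(looks_non_english(tokens))                  # True: 7 of 10 tokens are non-alphabetic
print(looks_non_english(tokens, threshold=0.8))   # False under the old 0.8 cutoff
```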