TeslaZY committed on
Commit
624ceba
·
1 Parent(s): 0812124

Fix some bugs in text2sql.(#4279)(#4281) (#4280)

Browse files

Fix some bugs in text2sql.(#4279)(#4281)

### What problem does this PR solve?
- Fix incorrect results when parsing CSV files of the QA knowledge base in
the text2sql scenario: process CSV files using the csv library and decouple
CSV parsing from TXT parsing.
- Most LLMs return results in markdown format (```sql query ```). Fix the
execution error caused by SQL wrapped in markdown in the LLM output.

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (2) hide show
  1. agent/component/exesql.py +8 -12
  2. rag/app/qa.py +34 -2
agent/component/exesql.py CHANGED
@@ -65,20 +65,16 @@ class ExeSQL(ComponentBase, ABC):
65
  self._loop += 1
66
 
67
  ans = self.get_input()
68
-
69
-
70
  ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
71
- if self._param.db_type == 'mssql':
72
- # improve the information extraction, most llm return results in markdown format ```sql query ```
73
- match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
74
- if match:
75
- ans = match.group(1) # Query content
76
- print(ans)
77
- else:
78
- print("no markdown")
79
- ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
80
  else:
81
- ans = re.sub(r'^.*?SELECT ', 'SELECT ', repr(ans), flags=re.IGNORECASE)
 
82
  ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
83
  ans = re.sub(r';[^;]*$', r';', ans)
84
  if not ans:
 
65
  self._loop += 1
66
 
67
  ans = self.get_input()
 
 
68
  ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
69
+
70
+ # improve the information extraction, most llm return results in markdown format ```sql query ```
71
+ match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
72
+ if match:
73
+ ans = match.group(1) # Query content
74
+ print(ans)
 
 
 
75
  else:
76
+ print("no markdown")
77
+ ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
78
  ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
79
  ans = re.sub(r';[^;]*$', r';', ans)
80
  if not ans:
rag/app/qa.py CHANGED
@@ -12,6 +12,7 @@
12
  #
13
  import logging
14
  import re
 
15
  from copy import deepcopy
16
  from io import BytesIO
17
  from timeit import default_timer as timer
@@ -25,7 +26,6 @@ from docx import Document
25
  from PIL import Image
26
  from markdown import markdown
27
 
28
-
29
  class Excel(ExcelParser):
30
  def __call__(self, fnm, binary=None, callback=None):
31
  if not binary:
@@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
320
  res.append(beAdoc(deepcopy(doc), q, a, eng))
321
  return res
322
 
323
- elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
324
  callback(0.1, "Start to parse.")
325
  txt = get_text(filename, binary)
326
  lines = txt.split("\n")
@@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
359
 
360
  return res
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
363
  callback(0.1, "Start to parse.")
364
  pdf_parser = Pdf()
 
12
  #
13
  import logging
14
  import re
15
+ import csv
16
  from copy import deepcopy
17
  from io import BytesIO
18
  from timeit import default_timer as timer
 
26
  from PIL import Image
27
  from markdown import markdown
28
 
 
29
  class Excel(ExcelParser):
30
  def __call__(self, fnm, binary=None, callback=None):
31
  if not binary:
 
320
  res.append(beAdoc(deepcopy(doc), q, a, eng))
321
  return res
322
 
323
+ elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
324
  callback(0.1, "Start to parse.")
325
  txt = get_text(filename, binary)
326
  lines = txt.split("\n")
 
359
 
360
  return res
361
 
362
+ elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
363
+ callback(0.1, "Start to parse.")
364
+ txt = get_text(filename, binary)
365
+ lines = txt.split("\n")
366
+ delimiter = "\t" if any("\t" in line for line in lines) else ","
367
+
368
+ fails = []
369
+ question, answer = "", ""
370
+ res = []
371
+ reader = csv.reader(lines, delimiter=delimiter)
372
+
373
+ for i, row in enumerate(reader):
374
+ if len(row) != 2:
375
+ if question:
376
+ answer += "\n" + lines[i]
377
+ else:
378
+ fails.append(str(i + 1))
379
+ elif len(row) == 2:
380
+ if question and answer:
381
+ res.append(beAdoc(deepcopy(doc), question, answer, eng))
382
+ question, answer = row
383
+ if len(res) % 999 == 0:
384
+ callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
385
+ f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
386
+
387
+ if question:
388
+ res.append(beAdoc(deepcopy(doc), question, answer, eng))
389
+
390
+ callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
391
+ f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
392
+ return res
393
+
394
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
395
  callback(0.1, "Start to parse.")
396
  pdf_parser = Pdf()