Kevin Hu committed on
Commit
7b6896b
·
1 Parent(s): 77dc93a

refine error log while chunking (#1937)

Browse files

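### What problem does this PR solve?

Previously, fetching the file from MinIO and chunking it shared one `try` block in `build()`, so a chunking failure was reported with the same fetch-oriented messages. This change gives each step its own `try`/`except`: MinIO failures are reported as MinIO errors, and chunking failures are reported as "Internal server error while chunking". It also corrects the "Chunkking" typo in the log messages and passes `tenant_id` and `lang` to the chunker in `upload_and_parse()`.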


### Type of change

- [x] Refactoring

api/apps/document_app.py CHANGED
@@ -501,7 +501,9 @@ def upload_and_parse():
501
  "callback": dummy,
502
  "parser_config": parser_config,
503
  "from_page": 0,
504
- "to_page": 100000
 
 
505
  }
506
  threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))
507
 
 
501
  "callback": dummy,
502
  "parser_config": parser_config,
503
  "from_page": 0,
504
+ "to_page": 100000,
505
+ "tenant_id": kb.tenant_id,
506
+ "lang": kb.language
507
  }
508
  threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))
509
 
rag/svr/task_executor.py CHANGED
@@ -146,27 +146,32 @@ def build(row):
146
  binary = get_minio_binary(bucket, name)
147
  cron_logger.info(
148
  "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
149
- cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
150
- to_page=row["to_page"], lang=row["language"], callback=callback,
151
- kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
152
- cron_logger.info(
153
- "Chunkking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
154
  except TimeoutError as e:
155
- callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.")
156
  cron_logger.error(
157
- "Chunkking {}/{}: Fetch file timeout.".format(row["location"], row["name"]))
158
  return
159
  except Exception as e:
160
  if re.search("(No such file|not found)", str(e)):
161
- callback(-1, "Can not find file <%s>" % row["name"])
162
  else:
163
- callback(-1, f"Internal server error: %s" %
164
  str(e).replace("'", ""))
165
  traceback.print_exc()
 
166
 
 
 
 
 
 
 
 
 
 
167
  cron_logger.error(
168
- "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
169
-
170
  return
171
 
172
  docs = []
 
146
  binary = get_minio_binary(bucket, name)
147
  cron_logger.info(
148
  "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
 
 
 
 
 
149
  except TimeoutError as e:
150
+ callback(-1, f"Internal server error: Fetch file from minio timeout. Could you try it again.")
151
  cron_logger.error(
152
+ "Minio {}/{}: Fetch file from minio timeout.".format(row["location"], row["name"]))
153
  return
154
  except Exception as e:
155
  if re.search("(No such file|not found)", str(e)):
156
+ callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
157
  else:
158
+ callback(-1, f"Get file from minio: %s" %
159
  str(e).replace("'", ""))
160
  traceback.print_exc()
161
+ return
162
 
163
+ try:
164
+ cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
165
+ to_page=row["to_page"], lang=row["language"], callback=callback,
166
+ kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
167
+ cron_logger.info(
168
+ "Chunking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
169
+ except Exception as e:
170
+ callback(-1, f"Internal server error while chunking: %s" %
171
+ str(e).replace("'", ""))
172
  cron_logger.error(
173
+ "Chunking {}/{}: {}".format(row["location"], row["name"], str(e)))
174
+ traceback.print_exc()
175
  return
176
 
177
  docs = []