zhichyu committed on
Commit
9a73da2
·
1 Parent(s): b5eedaf

Always open text file for write with UTF-8 (#3688)

Browse files

### What problem does this PR solve?

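Always open text files for writing with an explicit UTF-8 encoding. Without it, `open()` falls back to the platform's locale encoding, which can fail on or mis-encode non-ASCII text (e.g. on Windows). Closes #932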

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

api/utils/file_utils.py CHANGED
@@ -146,7 +146,7 @@ def rewrite_yaml_conf(conf_path, config):
146
 
147
 
148
  def rewrite_json_file(filepath, json_data):
149
- with open(filepath, "w") as f:
150
  json.dump(json_data, f, indent=4, separators=(",", ": "))
151
  f.close()
152
 
 
146
 
147
 
148
  def rewrite_json_file(filepath, json_data):
149
+ with open(filepath, "w", encoding='utf-8') as f:
150
  json.dump(json_data, f, indent=4, separators=(",", ": "))
151
  f.close()
152
 
deepdoc/parser/resume/entities/schools.py CHANGED
@@ -11,7 +11,10 @@
11
  # limitations under the License.
12
  #
13
 
14
- import os, json,re,copy
 
 
 
15
  import pandas as pd
16
  current_file_path = os.path.dirname(os.path.abspath(__file__))
17
  TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
@@ -23,7 +26,7 @@ GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])
23
  def loadRank(fnm):
24
  global TBL
25
  TBL["rank"] = 1000000
26
- with open(fnm, "r",encoding='UTF-8') as f:
27
  while True:
28
  l = f.readline()
29
  if not l:break
@@ -32,7 +35,7 @@ def loadRank(fnm):
32
  nm,rk = l[0].strip(),int(l[1])
33
  #assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
34
  TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
35
- except Exception as e:
36
  pass
37
 
38
 
 
11
  # limitations under the License.
12
  #
13
 
14
+ import os
15
+ import json
16
+ import re
17
+ import copy
18
  import pandas as pd
19
  current_file_path = os.path.dirname(os.path.abspath(__file__))
20
  TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
 
26
  def loadRank(fnm):
27
  global TBL
28
  TBL["rank"] = 1000000
29
+ with open(fnm, "r", encoding='utf-8') as f:
30
  while True:
31
  l = f.readline()
32
  if not l:break
 
35
  nm,rk = l[0].strip(),int(l[1])
36
  #assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
37
  TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
38
+ except Exception:
39
  pass
40
 
41
 
deepdoc/vision/t_ocr.py CHANGED
@@ -41,7 +41,7 @@ def main(args):
41
  "score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
42
  img = draw_box(images[i], bxs, ["ocr"], 1.)
43
  img.save(outputs[i], quality=95)
44
- with open(outputs[i] + ".txt", "w+") as f:
45
  f.write("\n".join([o["text"] for o in bxs]))
46
 
47
 
 
41
  "score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
42
  img = draw_box(images[i], bxs, ["ocr"], 1.)
43
  img.save(outputs[i], quality=95)
44
+ with open(outputs[i] + ".txt", "w+", encoding='utf-8') as f:
45
  f.write("\n".join([o["text"] for o in bxs]))
46
 
47
 
deepdoc/vision/t_recognizer.py CHANGED
@@ -50,7 +50,7 @@ def main(args):
50
  if args.mode.lower() == "tsr":
51
  #lyt = [t for t in lyt if t["type"] == "table column"]
52
  html = get_table_html(images[i], lyt, ocr)
53
- with open(outputs[i] + ".html", "w+") as f:
54
  f.write(html)
55
  lyt = [{
56
  "type": t["label"],
 
50
  if args.mode.lower() == "tsr":
51
  #lyt = [t for t in lyt if t["type"] == "table column"]
52
  html = get_table_html(images[i], lyt, ocr)
53
+ with open(outputs[i] + ".html", "w+", encoding='utf-8') as f:
54
  f.write(html)
55
  lyt = [{
56
  "type": t["label"],
rag/benchmark.py CHANGED
@@ -237,8 +237,8 @@ class Benchmark:
237
  scores = sorted(scores, key=lambda kk: kk[1])
238
  for score in scores[:10]:
239
  f.write('- text: ' + str(texts[score[0]]) + '\t qrel: ' + str(score[1]) + '\n')
240
- json.dump(qrels, open(os.path.join(file_path, dataset + '.qrels.json'), "w+"), indent=2)
241
- json.dump(run, open(os.path.join(file_path, dataset + '.run.json'), "w+"), indent=2)
242
  print(os.path.join(file_path, dataset + '_result.md'), 'Saved!')
243
 
244
  def __call__(self, dataset, file_path, miracl_corpus=''):
 
237
  scores = sorted(scores, key=lambda kk: kk[1])
238
  for score in scores[:10]:
239
  f.write('- text: ' + str(texts[score[0]]) + '\t qrel: ' + str(score[1]) + '\n')
240
+ json.dump(qrels, open(os.path.join(file_path, dataset + '.qrels.json'), "w+", encoding='utf-8'), indent=2)
241
+ json.dump(run, open(os.path.join(file_path, dataset + '.run.json'), "w+", encoding='utf-8'), indent=2)
242
  print(os.path.join(file_path, dataset + '_result.md'), 'Saved!')
243
 
244
  def __call__(self, dataset, file_path, miracl_corpus=''):