WangBaoling and KevinHuSh committed
Commit 77b7e10 · 1 Parent(s): acec9a8

feat: support json file (#1217)


### What problem does this PR solve?

feat: support json file.
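
For context, a minimal usage sketch of the new parser (a sketch only, assuming a RAGFlow checkout with `deepdoc` and `rag` importable; `sample.json` is a hypothetical input file):

```python
from deepdoc.parser import JsonParser  # alias for RAGFlowJsonParser, added by this PR

# rag/app/naive.py passes parser_config.get("chunk_token_num", 128) as max_chunk_size
parser = JsonParser(128)

with open("sample.json", "rb") as f:
    sections = parser(f.read())  # list of JSON-formatted string chunks

for s in sections:
    print(len(s), s[:80])
```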

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: KevinHuSh <[email protected]>

deepdoc/parser/__init__.py CHANGED
@@ -16,3 +16,4 @@ from .docx_parser import RAGFlowDocxParser as DocxParser
  from .excel_parser import RAGFlowExcelParser as ExcelParser
  from .ppt_parser import RAGFlowPptParser as PptParser
  from .html_parser import RAGFlowHtmlParser as HtmlParser
+ from .json_parser import RAGFlowJsonParser as JsonParser
deepdoc/parser/json_parser.py ADDED
@@ -0,0 +1,116 @@
+ # -*- coding: utf-8 -*-
+ # Adapted, with only minor modifications, from
+ # https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
+
+ import json
+ from typing import Any, Dict, List, Optional
+ from rag.nlp import find_codec
+
+ class RAGFlowJsonParser:
+     def __init__(
+         self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
+     ):
+         super().__init__()
+         self.max_chunk_size = max_chunk_size * 2
+         self.min_chunk_size = (
+             min_chunk_size
+             if min_chunk_size is not None
+             else max(max_chunk_size - 200, 50)
+         )
+
+     def __call__(self, binary):
+         encoding = find_codec(binary)
+         txt = binary.decode(encoding, errors="ignore")
+         json_data = json.loads(txt)
+         chunks = self.split_json(json_data, True)
+         sections = [json.dumps(l, ensure_ascii=False) for l in chunks if l]
+         return sections
+
+     @staticmethod
+     def _json_size(data: Dict) -> int:
+         """Calculate the size of the serialized JSON object."""
+         return len(json.dumps(data, ensure_ascii=False))
+
+     @staticmethod
+     def _set_nested_dict(d: Dict, path: List[str], value: Any) -> None:
+         """Set a value in a nested dictionary based on the given path."""
+         for key in path[:-1]:
+             d = d.setdefault(key, {})
+         d[path[-1]] = value
+
+     def _list_to_dict_preprocessing(self, data: Any) -> Any:
+         if isinstance(data, dict):
+             # Process each key-value pair in the dictionary
+             return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
+         elif isinstance(data, list):
+             # Convert the list to a dictionary with index-based keys
+             return {
+                 str(i): self._list_to_dict_preprocessing(item)
+                 for i, item in enumerate(data)
+             }
+         else:
+             # Base case: the item is neither a dict nor a list, so return it unchanged
+             return data
+
+     def _json_split(
+         self,
+         data: Dict[str, Any],
+         current_path: Optional[List[str]] = None,
+         chunks: Optional[List[Dict]] = None,
+     ) -> List[Dict]:
+         """
+         Split json into maximum size dictionaries while preserving structure.
+         """
+         current_path = current_path or []
+         chunks = chunks or [{}]
+         if isinstance(data, dict):
+             for key, value in data.items():
+                 new_path = current_path + [key]
+                 chunk_size = self._json_size(chunks[-1])
+                 size = self._json_size({key: value})
+                 remaining = self.max_chunk_size - chunk_size
+
+                 if size < remaining:
+                     # Add item to current chunk
+                     self._set_nested_dict(chunks[-1], new_path, value)
+                 else:
+                     if chunk_size >= self.min_chunk_size:
+                         # Chunk is big enough, start a new chunk
+                         chunks.append({})
+
+                     # Iterate
+                     self._json_split(value, new_path, chunks)
+         else:
+             # handle single item
+             self._set_nested_dict(chunks[-1], current_path, data)
+         return chunks
+
+     def split_json(
+         self,
+         json_data: Dict[str, Any],
+         convert_lists: bool = False,
+     ) -> List[Dict]:
+         """Splits JSON into a list of JSON chunks"""
+
+         if convert_lists:
+             chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
+         else:
+             chunks = self._json_split(json_data)
+
+         # Remove the last chunk if it's empty
+         if not chunks[-1]:
+             chunks.pop()
+         return chunks
+
+     def split_text(
+         self,
+         json_data: Dict[str, Any],
+         convert_lists: bool = False,
+         ensure_ascii: bool = True,
+     ) -> List[str]:
+         """Splits JSON into a list of JSON formatted strings"""
+
+         chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
+
+         # Convert to string
+         return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
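
The splitter walks the JSON tree and greedily packs key/value pairs into the current chunk while its serialized size stays under the budget; `convert_lists=True` first rewrites lists as dicts keyed by index so list elements can be split like any other key. A small sketch of that behavior (hypothetical sample data; assumes the module is importable, since it pulls in `rag.nlp.find_codec` at import time):

```python
from deepdoc.parser.json_parser import RAGFlowJsonParser

parser = RAGFlowJsonParser(max_chunk_size=50)

data = {"title": "demo", "items": ["a", "b", "c"], "meta": {"lang": "en"}}

# convert_lists=True turns lists into dicts keyed by "0", "1", ...,
# so their elements can be distributed across chunks.
chunks = parser.split_json(data, convert_lists=True)

# split_text serializes each chunk back to a JSON string.
for s in parser.split_text(data, convert_lists=True, ensure_ascii=False):
    print(s)
```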
rag/app/naive.py CHANGED
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
  import re
  from deepdoc.parser.pdf_parser import PlainParser
  from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
- from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser
+ from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
  from rag.settings import cron_logger
  from rag.utils import num_tokens_from_string

@@ -167,6 +167,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
          sections = [(l, "") for l in sections if l]
          callback(0.8, "Finish parsing.")

+     elif re.search(r"\.json$", filename, re.IGNORECASE):
+         callback(0.1, "Start to parse.")
+         sections = JsonParser(parser_config.get("chunk_token_num", 128))(binary)
+         sections = [(l, "") for l in sections if l]
+         callback(0.8, "Finish parsing.")
+
      elif re.search(r"\.doc$", filename, re.IGNORECASE):
          callback(0.1, "Start to parse.")
          binary = BytesIO(binary)
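
For reference, with the default `chunk_token_num` of 128 used above, the JSON splitter's internal size budget resolves as follows (a worked reading of the constructor in json_parser.py, not new behavior):

```python
chunk_token_num = 128                             # parser_config default in rag/app/naive.py
max_chunk_size = chunk_token_num * 2              # 256 serialized characters per chunk
min_chunk_size = max(chunk_token_num - 200, 50)   # 50: keep filling a chunk until at least this size
print(max_chunk_size, min_chunk_size)             # 256 50
```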
rag/nlp/__init__.py CHANGED
@@ -471,7 +471,9 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
          tnum = num_tokens_from_string(t)
          if tnum < 8:
              pos = ""
+         # Ensure that the length of the merged chunk does not exceed chunk_token_num
          if tk_nums[-1] > chunk_token_num:
+
              if t.find(pos) < 0:
                  t += pos
              cks.append(t)
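
The annotated `naive_merge` is what rag/app/naive.py uses to merge parsed sections into token-bounded chunks. A small usage sketch (hypothetical sections, assuming a RAGFlow checkout):

```python
from rag.nlp import naive_merge

# (text, position) tuples, as produced by the new JSON branch in rag/app/naive.py
sections = [('{"title": "demo"}', ""), ('{"body": "hello"}', "")]

# Sections keep merging into the current chunk until its running token count
# exceeds chunk_token_num, at which point a new chunk is started.
chunks = naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?")
print(chunks)
```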