Spaces:

retopara
/

ragflow

Build error

WangBaoling KevinHuSh committed on Jun 21, 2024

Commit

77b7e10

1 Parent(s): acec9a8

feat: support json file (#1217)

### What problem does this PR solve?

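Adds support for JSON files: a new JSON parser splits the document into chunks, and the naive chunker routes `.json` files to it.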

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: KevinHuSh <[email protected]>

Files changed (4) hide show

deepdoc/parser/__init__.py +1 -0
deepdoc/parser/json_parser.py +116 -0
rag/app/naive.py +7 -1
rag/nlp/__init__.py +2 -0

deepdoc/parser/__init__.py CHANGED Viewed

@@ -16,3 +16,4 @@ from .docx_parser import RAGFlowDocxParser as DocxParser
 from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser

 from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
+from .json_parser import RAGFlowJsonParser as JsonParser

deepdoc/parser/json_parser.py ADDED Viewed

	@@ -0,0 +1,116 @@

+# -*- coding: utf-8 -*-
+# Adapted, with minor modifications, from the LangChain JSON text splitter:
+# https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
+import json
+from typing import Any, Dict, List, Optional
+from rag.nlp import find_codec
+class RAGFlowJsonParser:
+    """Split a JSON document into size-bounded JSON chunks.
+
+    Nested dictionaries are walked recursively and their entries are packed
+    into chunk dictionaries that keep the original key path, so every chunk
+    produced from a JSON object is itself a JSON object.
+    """
+
+    def __init__(
+        self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
+    ):
+        """Configure the chunk size limits.
+
+        Args:
+            max_chunk_size: Base size limit. The upper bound stored on the
+                instance is twice this value and is compared against the
+                character length of the serialized chunk.
+            min_chunk_size: Serialized size a chunk must reach before a new
+                chunk may be started. When omitted it defaults to
+                ``max_chunk_size - 200`` (computed from the undoubled
+                argument), but never less than 50.
+        """
+        super().__init__()
+        # NOTE(review): the limit is doubled while the default minimum below
+        # is derived from the undoubled argument -- confirm this is intended.
+        self.max_chunk_size = max_chunk_size * 2
+        self.min_chunk_size = (
+            min_chunk_size
+            if min_chunk_size is not None
+            else max(max_chunk_size - 200, 50)
+        )
+
+    def __call__(self, binary):
+        """Decode raw JSON bytes and return the chunks as JSON strings.
+
+        Args:
+            binary: The encoded JSON document (a bytes-like object with a
+                ``decode`` method).
+
+        Returns:
+            One JSON string per non-empty chunk, serialized with
+            ``ensure_ascii=False``. Lists are converted to index-keyed
+            dictionaries before splitting.
+
+        Raises:
+            json.JSONDecodeError: If the decoded text is not valid JSON.
+        """
+        encoding = find_codec(binary)
+        # Bytes that cannot be decoded are dropped instead of raising.
+        txt = binary.decode(encoding, errors="ignore")
+        json_data = json.loads(txt)
+        chunks = self.split_json(json_data, True)
+        return [json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk]
+
+    @staticmethod
+    def _json_size(data: Dict) -> int:
+        """Return the character length of ``data`` serialized as JSON."""
+        return len(json.dumps(data, ensure_ascii=False))
+
+    @staticmethod
+    def _set_nested_dict(d: Dict, path: List[str], value: Any) -> None:
+        """Store ``value`` in ``d`` under the nested keys listed in ``path``.
+
+        Intermediate dictionaries are created on demand. ``path`` must hold
+        at least one key; an empty path raises ``IndexError``.
+        """
+        for key in path[:-1]:
+            d = d.setdefault(key, {})
+        d[path[-1]] = value
+
+    def _list_to_dict_preprocessing(self, data: Any) -> Any:
+        """Return a copy of ``data`` with every list turned into a dict.
+
+        List items are keyed by their index rendered as a string ("0", "1",
+        ...). Dictionaries are rebuilt recursively; any other value is
+        returned unchanged.
+        """
+        if isinstance(data, dict):
+            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
+        if isinstance(data, list):
+            return {
+                str(i): self._list_to_dict_preprocessing(item)
+                for i, item in enumerate(data)
+            }
+        return data
+
+    def _json_split(
+        self,
+        data: Any,
+        current_path: Optional[List[str]] = None,
+        chunks: Optional[List[Dict]] = None,
+    ) -> List[Dict]:
+        """Pack ``data`` into ``chunks`` while preserving its key structure.
+
+        Args:
+            data: The value to distribute. At the top level this must be a
+                dictionary; nested calls may receive any JSON value.
+            current_path: Keys leading from the document root to ``data``.
+            chunks: Chunks built so far; the last one is the chunk being
+                filled. A fresh ``[{}]`` is used when omitted.
+
+        Returns:
+            The list of chunk dictionaries (the same list object that was
+            passed in, when one was given).
+        """
+        current_path = current_path or []
+        chunks = chunks or [{}]
+        # A nested empty dict has no entries to iterate over, so it is stored
+        # as a leaf value; otherwise its key would vanish from the output.
+        if isinstance(data, dict) and (data or not current_path):
+            for key, value in data.items():
+                new_path = current_path + [key]
+                chunk_size = self._json_size(chunks[-1])
+                size = self._json_size({key: value})
+                remaining = self.max_chunk_size - chunk_size
+                if size < remaining:
+                    # The whole entry fits into the chunk being filled.
+                    self._set_nested_dict(chunks[-1], new_path, value)
+                else:
+                    if chunk_size >= self.min_chunk_size:
+                        # The current chunk is large enough; open a new one.
+                        chunks.append({})
+                    # Descend so the entry's children can be spread over
+                    # chunks; a non-dict value is stored as is, even if it
+                    # exceeds the size limit.
+                    self._json_split(value, new_path, chunks)
+        else:
+            # Leaf value (or nested empty dict): attach it at its path.
+            self._set_nested_dict(chunks[-1], current_path, data)
+        return chunks
+
+    def split_json(
+        self,
+        json_data: Any,
+        convert_lists: bool = False,
+    ) -> List[Any]:
+        """Split parsed JSON into a list of JSON chunks.
+
+        Args:
+            json_data: The parsed JSON document.
+            convert_lists: When true, lists are first converted to
+                index-keyed dictionaries so that they can be split too.
+
+        Returns:
+            The chunk dictionaries, without a trailing empty chunk. A
+            top-level value that is not a dictionary (a scalar, or a list
+            when ``convert_lists`` is false) has no keys to split on and is
+            returned as a single chunk.
+        """
+        data = (
+            self._list_to_dict_preprocessing(json_data)
+            if convert_lists
+            else json_data
+        )
+        if not isinstance(data, dict):
+            # Previously this case crashed with IndexError because there is
+            # no key path under which the value could be stored.
+            return [data]
+        chunks = self._json_split(data)
+        # Remove the last chunk if it's empty
+        if not chunks[-1]:
+            chunks.pop()
+        return chunks
+
+    def split_text(
+        self,
+        json_data: Any,
+        convert_lists: bool = False,
+        ensure_ascii: bool = True,
+    ) -> List[str]:
+        """Split parsed JSON into a list of JSON-formatted strings.
+
+        Args:
+            json_data: The parsed JSON document.
+            convert_lists: Forwarded to :meth:`split_json`.
+            ensure_ascii: Forwarded to ``json.dumps``; when true, non-ASCII
+                characters are escaped in the output strings.
+
+        Returns:
+            One serialized JSON string per chunk.
+        """
+        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
+        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]

rag/app/naive.py CHANGED Viewed

@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
@@ -167,6 +167,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         binary = BytesIO(binary)

 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
+    elif re.search(r"\.json$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = JsonParser(parser_config.get("chunk_token_num", 128))(binary)
+        sections = [(l, "") for l in sections if l]
+        callback(0.8, "Finish parsing.")
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         binary = BytesIO(binary)

rag/nlp/__init__.py CHANGED Viewed

@@ -471,7 +471,9 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):
         tnum = num_tokens_from_string(t)
         if tnum < 8:
             pos = ""
         if tk_nums[-1] > chunk_token_num:
             if t.find(pos) < 0:
                 t += pos
             cks.append(t)

         tnum = num_tokens_from_string(t)
         if tnum < 8:
             pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
         if tk_nums[-1] > chunk_token_num:
             if t.find(pos) < 0:
                 t += pos
             cks.append(t)