openGPT-X
/

Teuken-7B-instruct-research-v0.4

Text Generation

text-generation-inference

Model card · Files and versions

mfromm committed on Nov 22, 2024

Commit

238e48d

·

verified ·

1 Parent(s): 9a67d89

Update gptx_tokenizer.py

Files changed (1) hide show

gptx_tokenizer.py +9 -24

gptx_tokenizer.py CHANGED Viewed

@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
 import sentencepiece as spm
-from huggingface_hub import hf_hub_download, list_repo_files
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
@@ -64,29 +64,14 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
             f"<placeholder_tok_{i}>" for i in range(256)
         ]
-    def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Path:
-        if repo_id is None:
-            raise ValueError("repo_id must be provided if config_path is not a local file")
-        try:
-            # List all files in the repo
-            repo_files = list_repo_files(repo_id)
-            # Find the tokenizer config file
-            tokenizer_files = [f for f in repo_files if f.endswith('tokenizer_config.json')]
-            if not tokenizer_files:
-                raise FileNotFoundError(f"No tokenizer_config.json file found in repository {repo_id}")
-            # Use the first tokenizer_config.json file found
-            tokenizer_config_file = tokenizer_files[0]
-            print(f"Found tokenizer config file: {tokenizer_config_file}")
-            # Download the file
-            tokenizer_config_file_or_name = hf_hub_download(repo_id=repo_id, filename=tokenizer_config_file)
-            print(f"Downloaded tokenizer config file to: {tokenizer_config_file_or_name}")
-            return tokenizer_config_file_or_name
-        except Exception as e:
-            raise OSError(f"Failed to download tokenizer model: {str(e)}")
     def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
         """

 from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
 import sentencepiece as spm
+from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_cache
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
             f"<placeholder_tok_{i}>" for i in range(256)
         ]
+    def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Optional[Path]:
+        if not os.path.isfile(config_path):
+            config_path = try_to_load_from_cache(repo_id=repo_id, filename=Path(config_path).name)
+            if not config_path:
+                config_path = self._download_config_from_hub(repo_id=repo_id)
+        return config_path
     def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
         """