jinhai-2012 Kevin Hu committed on
Commit
f567bc7
·
1 Parent(s): e023933

Move cl100k_base tokenizer to docker image (#3411)

Browse files

### What problem does this PR solve?

Move the tiktoken cl100k_base encoding file into the Docker image

issue: #3338

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <[email protected]>
Co-authored-by: Kevin Hu <[email protected]>

Files changed (4) hide show
  1. Dockerfile +3 -0
  2. Dockerfile.slim +3 -0
  3. download_deps.py +4 -0
  4. rag/utils/__init__.py +5 -4
Dockerfile CHANGED
@@ -119,6 +119,9 @@ COPY nltk_data /root/nltk_data
119
  COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./
120
  ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard.jar"
121
 
 
 
 
122
  # Copy compiled web pages
123
  COPY --from=builder /ragflow/web/dist /ragflow/web/dist
124
 
 
119
  COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./
120
  ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard.jar"
121
 
122
+ # Copy cl100k_base
123
+ COPY 9b5ad71b2ce5302211f9c61530b329a4922fc6a4 ./
124
+
125
  # Copy compiled web pages
126
  COPY --from=builder /ragflow/web/dist /ragflow/web/dist
127
 
Dockerfile.slim CHANGED
@@ -112,6 +112,9 @@ COPY nltk_data /root/nltk_data
112
  COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./
113
  ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard.jar"
114
 
 
 
 
115
  # Copy compiled web pages
116
  COPY --from=builder /ragflow/web/dist /ragflow/web/dist
117
 
 
112
  COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./
113
  ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard.jar"
114
 
115
+ # Copy cl100k_base
116
+ COPY 9b5ad71b2ce5302211f9c61530b329a4922fc6a4 ./
117
+
118
  # Copy compiled web pages
119
  COPY --from=builder /ragflow/web/dist /ragflow/web/dist
120
 
download_deps.py CHANGED
@@ -1,4 +1,5 @@
1
  #!/usr/bin/env python3
 
2
 
3
  from huggingface_hub import snapshot_download
4
  import nltk
@@ -9,6 +10,7 @@ urls = [
9
  "http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb",
10
  "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar",
11
  "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5",
 
12
  ]
13
 
14
  repos = [
@@ -41,3 +43,5 @@ if __name__ == "__main__":
41
  for repo_id in repos:
42
  print(f"Downloading huggingface repo {repo_id}...")
43
  download_model(repo_id)
 
 
 
1
  #!/usr/bin/env python3
2
+ from os import rename
3
 
4
  from huggingface_hub import snapshot_download
5
  import nltk
 
10
  "http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb",
11
  "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar",
12
  "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5",
13
+ "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
14
  ]
15
 
16
  repos = [
 
43
  for repo_id in repos:
44
  print(f"Downloading huggingface repo {repo_id}...")
45
  download_model(repo_id)
46
+
47
+ rename("cl100k_base.tiktoken", "9b5ad71b2ce5302211f9c61530b329a4922fc6a4")
rag/utils/__init__.py CHANGED
@@ -17,7 +17,7 @@
17
  import os
18
  import re
19
  import tiktoken
20
-
21
 
22
  def singleton(cls, *args, **kw):
23
  instances = {}
@@ -71,9 +71,10 @@ def findMaxTm(fnm):
71
  pass
72
  return m
73
 
74
-
75
- encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
76
-
 
77
 
78
  def num_tokens_from_string(string: str) -> int:
79
  """Returns the number of tokens in a text string."""
 
17
  import os
18
  import re
19
  import tiktoken
20
+ from api.utils.file_utils import get_project_base_directory
21
 
22
  def singleton(cls, *args, **kw):
23
  instances = {}
 
71
  pass
72
  return m
73
 
74
+ tiktoken_cache_dir = get_project_base_directory()
75
+ os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
76
+ # encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
77
+ encoder = tiktoken.get_encoding("cl100k_base")
78
 
79
  def num_tokens_from_string(string: str) -> int:
80
  """Returns the number of tokens in a text string."""