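# Gradio app: document translation via Hugging Face Hub models. Prompt
# templates are defined for Llama 2, StarChat, and BLOOM, but only the BLOOM
# model is wired into the UI below.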
import os
import gradio as gr
from langchain.llms import HuggingFaceHub

llama_repo = os.getenv('HF_MODEL_LLAMA_REPO')
starchat_repo = os.getenv('HF_MODEL_STARCHAT_REPO')
bloom_repo = os.getenv('HF_MODEL_BLOOM_REPO')
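# HuggingFaceHub authenticates via the HUGGINGFACEHUB_API_TOKEN environment
# variable. The repo ids are read from the environment; plausible values
# (assumed examples, not taken from this repo) would be:
#   HF_MODEL_LLAMA_REPO="meta-llama/Llama-2-70b-chat-hf"
#   HF_MODEL_STARCHAT_REPO="HuggingFaceH4/starchat-beta"
#   HF_MODEL_BLOOM_REPO="bigscience/bloom"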

llama_template = """<s>[INST]<<SYS>>I want you to act as a document language translator. You translate {source} texts in the document into {target}, then you return to me the translated document AND DO NOTHING ELSE.<</SYS>>[/INST]
[INST]Begin of the document:
{query}
End of the document.[/INST]
{target} translated document:
"""

starchat_template = """<|system|>I want you to act as a document language translator. You translate {source} texts in the document into {target}, then you return to me the translated document AND DO NOTHING ELSE.<|end|>
Begin of the document:
{query}
End of the document.<|end|>
<|assistant|>
{target} translated document:
"""

bloom_template = """Text translation.
{source} text:
<s>{query}</s>
{target} translated text:
<s>"""

model_kwargs = {
    "max_new_tokens": 2048,
    "temperature": 0.01,
    "truncate": 4096,
    "seed": 42,
    "stop": ["</s>", "<|endoftext|>", "<|end|>"],
}

bloom_model_kwargs = {
    "max_new_tokens": 1000,
    "temperature": 0.01,
    # "truncate": 1512,
    "seed": 42,
    "stop": ["</s>", "<|endoftext|>", "<|end|>"],
}

# Only llm3 (BLOOM) is used by translation() below; llm1 and llm2 are
# instantiated for the Llama and StarChat templates above but not wired in.
llm1 = HuggingFaceHub(repo_id=llama_repo, task="text-generation", model_kwargs=model_kwargs)
llm2 = HuggingFaceHub(repo_id=starchat_repo, task="text-generation", model_kwargs=model_kwargs)
llm3 = HuggingFaceHub(repo_id=bloom_repo, task="text-generation", model_kwargs=bloom_model_kwargs)

def split_text_into_chunks(text, chunk_size=1000):
    """Split text into chunks of at most chunk_size characters, breaking on line boundaries."""
    lines = text.split('\n')
    chunks = []
    chunk = ""
    for line in lines:
        # If adding the current line keeps the chunk within the size limit, append it
        if len(chunk) + len(line) <= chunk_size:
            chunk += line + '\n'
        else:
            # Otherwise store the current chunk and start a new one with this line
            # (a single line longer than chunk_size still becomes its own chunk)
            if chunk:
                chunks.append(chunk)
            chunk = line + '\n'
    # Don't forget the last chunk
    if chunk:
        chunks.append(chunk)
    return chunks
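
# Quick sanity check of the chunker (illustrative only, not part of the app
# flow): with a tiny chunk_size, lines are packed until the limit is hit.
#   split_text_into_chunks("a\nb\nc", chunk_size=4)  ->  ['a\nb\n', 'c\n']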
    
def translation(source, target, text):
    response = ""
    chunks = split_text_into_chunks(text)
    for chunk in chunks:
        try:
            # Fill the BLOOM prompt template with the language pair and the chunk
            input_prompt = bloom_template.replace("{source}", source)
            input_prompt = input_prompt.replace("{target}", target)
            input_prompt = input_prompt.replace("{query}", chunk)
            stchunk = llm3(input_prompt)
            # Strip any stop tokens the model may have echoed back
            for eot in bloom_model_kwargs['stop']:
                stchunk = stchunk.replace(eot, "")
            response += stchunk + "\n"
        except Exception as e:
            print(f"ERROR: LLM raised an exception: {e}")
    # Fall back to the original input if every chunk failed
    if response == "":
        response = text
    return response.strip()
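
# Example call (hypothetical languages and text, shown for illustration only):
#   translation("English", "French", "Hello, world.")
# returns the BLOOM model's French rendering, or the input text on failure.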

# Inputs: source language, target language, document text; output: the translation.
gr.Interface(translation, inputs=["text", "text", "text"], outputs="text").launch()