File size: 4,069 Bytes
d54ec92
61cd51a
d54ec92
61cd51a
dd24401
d54ec92
 
 
 
0cc7583
d54ec92
 
 
a479d14
0cc7583
a10f12f
d54ec92
 
 
f031242
 
0cc7583
d424b19
0cc7583
 
 
 
a10f12f
0cc7583
d424b19
0cc7583
 
d54ec92
0cc7583
d424b19
0cc7583
 
f031242
d54ec92
 
 
43ab1fa
f031242
 
d54ec92
 
 
a479d14
d54ec92
 
 
 
 
 
 
 
f031242
 
edfd769
d54ec92
 
 
 
 
 
1d0916c
 
edfd769
d54ec92
 
43ab1fa
d54ec92
 
 
 
f031242
d54ec92
 
43ab1fa
d54ec92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from nltk import sent_tokenize
import openai
import re
import nltk
# Runs at import time: requests NLTK's 'punkt' resource, which is presumably
# the sentence-tokenizer data that sent_tokenize needs (verify against the
# installed NLTK version).
nltk.download('punkt')


class SynonymEditor:
    """Rewrite a text sentence by sentence through the OpenAI completion API.

    The text is split into paragraphs (on blank lines) and then sentences;
    each sentence is sent to the model with a language-specific prompt and
    the answers are joined back in the original paragraph layout.

    When ``language`` is ``'de'`` the German prompts are used; every other
    value selects the English prompts.
    """

    # German instruction shared by every German prompt variant.
    _DE_INSTRUCTION = 'Modernisiere den deutschen Text. Fasse direkte Reden NIE zusammen.\n'

    def __init__(self, api_key, model_engine, max_tokens, temperature, language):
        """Store the request settings and configure the OpenAI API key.

        Args:
            api_key: OpenAI API key; assigned to the module-global
                ``openai.api_key``.
            model_engine: Model name passed as ``model`` to the completion call.
            max_tokens: ``max_tokens`` value for each completion request.
            temperature: ``temperature`` value for each completion request.
            language: ``'de'`` for German prompts, anything else for English.
        """
        openai.api_key = api_key
        self.model_engine = model_engine
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.language = language
        # Placeholder that stands in for '"' while a sentence is sent to the model.
        self.quote = '__ZITIEREN__' if (language == 'de') else '__QUOTE__'

    # Play with the prompts here to see the effect of the wording on the
    # output quality. Note that the longer the prompt, the more tokens are
    # used and hence the higher the billing.
    def _get_prompt(self, sentence, few_shots):
        """Build the prompt for one sentence.

        Args:
            sentence: The (already quote-masked) sentence to rewrite.
            few_shots: A string of worked examples inserted between the
                instruction and the sentence, or a falsy value for a
                zero-shot prompt.

        Returns:
            The complete prompt string.
        """
        german = self.language == "de"

        if few_shots:
            if german:
                return (self._DE_INSTRUCTION
                        + few_shots + "\nEingang:" + sentence + " Ausgang:")
            return ("Replace exactly one word with a synonym while preserving "
                    "the overall sentence structure and meaning.\n"
                    + few_shots + "\nInput:" + sentence + " Output:")

        # The German zero-shot prompt is the same with or without a quote
        # placeholder in the sentence.
        if german:
            return self._DE_INSTRUCTION + sentence + '\n'

        if self.quote in sentence:
            # Tell the model explicitly to leave the placeholder untouched.
            return ("Replace exactly one word with a synonym while preserving "
                    + self.quote + " in the following sentence:\n"
                    + sentence + "\n")
        return ("Replace exactly one word with a synonym in the following "
                "sentence:\n" + sentence + "\n")

    # Call the OpenAI API here
    def __call_ai(self, sentence, few_shots):
        """Send one sentence to the model and return the post-processed answer.

        NOTE: ``openai.Completion`` is the legacy completions interface; it is
        not available in openai-python >= 1.0, so this requires an older
        client version.
        """
        prompt = self._get_prompt(sentence, few_shots)
        print(prompt)
        response = openai.Completion.create(
            model=self.model_engine,
            prompt=prompt,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return self._post_process_sentence(response.choices[0].text.strip())

    def _prepare_text(self, text, few_shots):
        """Mask quotation marks and normalise whitespace before tokenising.

        Quotes are masked only in zero-shot mode. The check uses truthiness so
        it agrees with ``_get_prompt``: an empty string or ``None`` counts as
        "no few-shot examples", exactly like ``False``.

        Returns:
            ``text`` with every whitespace run collapsed to one space and
            leading/trailing whitespace removed.
        """
        if not few_shots:
            text = text.replace('"', self.quote)
        return re.sub(r'\s+', ' ', text).strip()

    # Split the paragraph to preserve quotation marks
    def _split_into_sentences(self, text, few_shots):
        """Return the sentences of ``text`` after quote masking.

        NOTE(review): ``sent_tokenize`` is called with its default language
        even when ``self.language`` is ``'de'`` - confirm this is intended.
        """
        return sent_tokenize(self._prepare_text(text, few_shots))

    def _post_process_sentence(self, text):
        """Echo the model output and turn quote placeholders back into '"'."""
        print(text)
        print("==============")
        return text.replace(self.quote, '"')

    # Preprocess the text, perform edit task and join back to get the original format
    def _edit_text(self, text, few_shots=False):
        """Rewrite every sentence of ``text`` and rebuild the paragraphs.

        Args:
            text: Input text; paragraphs are separated by a blank line.
            few_shots: Optional few-shot examples string (falsy for none).

        Returns:
            The edited text: sentences joined by a space, paragraphs joined
            by a blank line.
        """
        edited_paragraphs = []
        for paragraph in text.split("\n\n"):
            edited_sentences = [
                self.__call_ai(sentence, few_shots)
                for sentence in self._split_into_sentences(paragraph, few_shots)
            ]
            edited_paragraphs.append(' '.join(edited_sentences))
        return '\n\n'.join(edited_paragraphs)

    # File Read Write operation
    def edit_file(self, input_file, output_file, few_shots=False):
        """Read ``input_file``, rewrite its text and save it to ``output_file``.

        Args:
            input_file: Path of the UTF-8 text file to read. Undecodable
                bytes are dropped (``errors="ignore"``).
            output_file: Path the edited text is written to, as UTF-8.
            few_shots: Optional few-shot examples string forwarded to the
                prompt builder; the default keeps the zero-shot behaviour.
        """
        print("Opening File")
        with open(input_file, "r", encoding="utf8", errors="ignore") as f:
            text = f.read()
        print("Editing")
        edited_text = self._edit_text(text, few_shots)
        print("Finishing up")
        # Write with the same encoding used for reading; the platform default
        # may be unable to encode non-ASCII characters such as umlauts.
        with open(output_file, "w", encoding="utf8") as f:
            f.write(edited_text)
        print("Done!")