|
import os |
|
from llama_cpp import Llama |
|
import gradio as gr |
|
|
|
from huggingface_hub import hf_hub_download |
|
|
|
# Fetch the quantized (Q4_K_M) GGUF chunker model from the Hugging Face Hub.
# hf_hub_download caches under local_dir, so repeat runs skip the download.
path = hf_hub_download(
    repo_id="alexneakameni/qwen2.5-0.5b-json-chunker-gguf",
    filename="unsloth.Q4_K_M.gguf",
    local_dir="data/",
)

# Load the model with llama.cpp. n_gpu_layers=-1 offloads every layer to the
# GPU when one is available; n_ctx caps the context window at 2048 tokens.
llm = Llama(
    model_path=path,
    n_gpu_layers=-1,
    n_ctx=2048,
    n_threads=os.cpu_count(),  # one worker thread per CPU core
    verbose=False,
)
|
|
|
|
|
# Few-shot skeleton of the required output: a fenced list of
# {"chunk", "classification"} dicts. It is interpolated verbatim into
# PROMPT_TEMPLATE's {EXPECTED_OUTPUT} slot so the model imitates the shape.
# NOTE: the blank lines below are inside the literal and thus part of the
# prompt; .strip() only trims leading/trailing whitespace, not these.
EXPECTED_OUTPUT = """```

[

{"chunk": "Text of the first chunk...", "classification": "Brief classification of first chunk"},

{"chunk": "Text of the second chunk...", "classification": "Brief classification of second chunk"},

...

]

```""".strip()
|
|
|
# Alpaca-style instruction prompt (Instruction / Input / Response sections).
# Filled via str.format in process_text: {EXPECTED_OUTPUT} receives the JSON
# skeleton above and {TEXT} receives the user's raw text. The blank lines are
# part of the literal and therefore part of what the model sees.
PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.



### Instruction:

You are an AI assistant tasked with analyzing and segmenting a given text into coherent chunks, each representing a main idea or topic. Your goal is to create a clear and structured segmentation of the text that helps readers navigate and understand the content, regardless of the subject.



Follow these steps to complete the task:



1. Read the entire text carefully to understand its content and structure.



2. Identify the main ideas or key topics discussed in the text.



3. Split the text into chunks, where each chunk corresponds to a single main idea. Aim for chunks that are typically one to two paragraphs long, ensuring they are neither too brief nor overly lengthy.



4. For each chunk you create:

a. Provide a short classification (in a few words) summarizing what the chunk is about.

b. Ensure the classification reflects the chunk's main idea accurately.

c. Keep the classification concise and informative, allowing readers to quickly grasp the chunk's topic.



5. Ensure that all parts of the text are included without overlapping ideas between chunks.



6. Make sure each chunk is self-contained and makes sense independently.



7. Return your result as a `list` of `dictionaries` (as in python language), each with two keys:

- "chunk": The text segment corresponding to the main idea.

- "classification": A brief summary of the topic covered by the chunk.



The output should follow this structure:



<output>

{EXPECTED_OUTPUT}

</output>



Additional guidelines:

- Preserve the original language of the text within the chunks; do not alter the wording.

- Ensure classifications are appropriate to the text's subject matter.

- If the text includes terms or concepts unique to its subject, include them in the classifications when relevant.



Remember, your objective is to create a clear and structured segmentation of the text that helps readers navigate and understand the content, regardless of the subject. Provide your answer in the specified JSON format inside <answer> tags.





### Input:

{TEXT}



### Response:"""
|
|
|
def process_text(text):
    """Segment and classify ``text`` with the local LLM, streaming output.

    Builds the chunking prompt, runs the llama.cpp model in streaming mode,
    and yields the accumulated completion after every token so the Gradio UI
    updates incrementally.

    Args:
        text: Raw user text to split into classified chunks.

    Yields:
        str: The generated output so far; on failure, the error message.
    """
    prompt = PROMPT_TEMPLATE.format(EXPECTED_OUTPUT=EXPECTED_OUTPUT, TEXT=text)
    # NOTE: the previous debug `print(prompt)` was removed — it dumped the
    # full prompt (including user input) to stdout on every request.

    result = ""
    try:
        # top_k=5 keeps sampling near-greedy so the JSON structure stays
        # well-formed; max_tokens bounds the completion length.
        for piece in llm(prompt, max_tokens=1024, stream=True, top_k=5):
            if piece and "choices" in piece and "text" in piece["choices"][0]:
                result += piece["choices"][0]["text"]
                yield result
    except Exception as ex:  # boundary handler: surface errors in the UI
        # BUG FIX: the original `return str(ex)` inside this generator
        # discarded the message (a generator's return value is never shown
        # by Gradio) — the stream just stopped silently. Yield it instead.
        yield str(ex)
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Assemble the UI components first, then wire them into the Interface.
    input_box = gr.Textbox(
        lines=10,
        placeholder="Enter the text you want to chunk...",
        label="Input Text",
    )
    output_box = gr.Text(
        lines=20,
        max_length=1024,
        label="Chunked Output",
    )

    demo = gr.Interface(
        fn=process_text,
        inputs=input_box,
        outputs=output_box,
        title="Text Chunking with Qwen2.5",
        description="This app splits text into meaningful chunks and classifies them.",
    )

    # queue() enables streaming from the generator fn; share=False keeps the
    # server local (no public Gradio tunnel).
    demo.queue().launch(share=False)
|
|