Spaces:

DevQuasar
/

TrashcanAI

Running

App Files Community

csabakecskemeti committed on Sep 6, 2024

Commit

71d0a2d

verified ·

1 Parent(s): 996b1d6

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -45

app.py CHANGED Viewed

@@ -3,89 +3,85 @@ import requests, json
 public_ip = '71.202.66.108'
-model = 'llama3.1:latest' #You can replace the model name if needed
 context = []
-import gradio as gr
-# ollama_serve = f"http://{mac_pro_ip}:11434/api/generate"
 ollama_serve = f"http://{public_ip}:11434/api/generate"
-#Call Ollama API
 def generate(prompt, context, top_k, top_p, temp):
     r = requests.post(ollama_serve,
-                     json={
-                         'model': model,
-                         'prompt': prompt,
-                         'context': context,
-                         'options':{
-                             'top_k': top_k,
-                             'temperature':top_p,
-                             'top_p': temp
-                         }
-                     },
-                     stream=True)
     r.raise_for_status()
     response = ""
     for line in r.iter_lines():
         body = json.loads(line)
         response_part = body.get('response', '')
-        print(response_part)
         if 'error' in body:
-            raise Exception(body['error'])
-        response += response_part
         if body.get('done', False):
             context = body.get('context', [])
-            return response, context
 def chat(input, chat_history, top_k, top_p, temp):
     chat_history = chat_history or []
     global context
-    output, context = generate(input, context, top_k, top_p, temp)
-    chat_history.append((input, output))
-    return chat_history, chat_history
-  #the first history in return history, history is meant to update the
-  #chatbot widget, and the second history is meant to update the state
-  #(which is used to maintain conversation history across interactions)
-#########################Gradio Code##########################
 block = gr.Blocks()
 with block:
-    gr.Markdown("""<h1><center> Trashcan AI </center></h1>
-    """)
-    gr.Markdown("""<h3><center> LLama3.1 hosted on a 2013 "Trashcan" Mac Pro with ollama </center></h3>
-    """)
     chatbot = gr.Chatbot()
     message = gr.Textbox(placeholder="Type here")
     state = gr.State()
     with gr.Row():
-        top_k = gr.Slider(0.0,100.0, label="top_k", value=40, info="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)")
-        top_p = gr.Slider(0.0,1.0, label="top_p", value=0.9, info=" Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)")
-        temp = gr.Slider(0.0,2.0, label="temperature", value=0.8, info="The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)")
     submit = gr.Button("SEND")
     submit.click(chat, inputs=[message, state, top_k, top_p, temp], outputs=[chatbot, state])
 if __name__ == "__main__":
-    block.launch()

+# Address of the host whose Ollama server (port 11434) this app calls.
 public_ip = '71.202.66.108'
+model = 'llama3.1:latest'  # You can replace the model name if needed
+# Module-level conversation context that chat() passes to generate(); being
+# a global, it is shared by every call to chat() in this process.
 context = []
+# Full URL of the Ollama generate endpoint that generate() posts to.
 ollama_serve = f"http://{public_ip}:11434/api/generate"
+# Call Ollama API
 def generate(prompt, context, top_k, top_p, temp):
+    """Stream a completion for *prompt* from the Ollama generate endpoint.
+
+    Yields the accumulated response text after each non-empty chunk. If the
+    server reports an error, yields a single "Error: ..." string and stops.
+    When the server marks the stream as done, the context it sent back is
+    the generator's return value (StopIteration.value); callers that only
+    iterate over the generator are unaffected.
+
+    Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx status.
+    """
     r = requests.post(ollama_serve,
+                      json={
+                          'model': model,
+                          'prompt': prompt,
+                          'context': context,
+                          'options': {
+                              'top_k': top_k,
+                              # Each sampling value goes to the option of the
+                              # same name (temp -> temperature, top_p -> top_p).
+                              'temperature': temp,
+                              'top_p': top_p
+                          }
+                      },
+                      stream=True)
     r.raise_for_status()
     response = ""
     for line in r.iter_lines():
+        if not line:
+            continue  # Skip empty keep-alive lines; json.loads would fail on them
         body = json.loads(line)
         response_part = body.get('response', '')
         if 'error' in body:
+            yield f"Error: {body['error']}"
+            return
+        # Append token to the growing response and yield the entire response so far
+        if response_part:
+            response += response_part
+            yield response  # Yield the growing response incrementally
         if body.get('done', False):
             context = body.get('context', [])
+            return context  # Hand the updated context back via StopIteration.value
 def chat(input, chat_history, top_k, top_p, temp):
+    """Stream the model's reply for *input* into the chat history.
+
+    Appends (input, "") to chat_history, then yields
+    (chat_history, chat_history) after each partial response from
+    generate(), with the last entry's reply replaced by the text so far.
+    When generate() finishes with a context, it is stored in the
+    module-level `context` so the next call continues the conversation.
+    """
     chat_history = chat_history or []
     global context
+    # Record the user's message with an empty reply that is filled in as
+    # chunks arrive
+    chat_history.append((input, ""))
+    response_stream = generate(input, context, top_k, top_p, temp)
+    # Drive the generator by hand so that its return value (the updated
+    # context) can be captured; a plain for-loop would discard it.
+    while True:
+        try:
+            response = next(response_stream)
+        except StopIteration as finished:
+            # None means the stream ended without a context (error reply or
+            # no "done" message); keep the previous context in that case.
+            if finished.value is not None:
+                context = finished.value
+            break
+        # Update the latest assistant response (the second part of the tuple)
+        chat_history[-1] = (input, response)
+        yield chat_history, chat_history  # Yield the updated chat history
+######################### Gradio Code ##########################
 block = gr.Blocks()
 with block:
+    # Page title and subtitle, written as centered HTML headings.
+    gr.Markdown("""<h1><center> Trashcan AI </center></h1>""")
+    gr.Markdown("""<h3><center> LLama3.1 hosted on a 2013 "Trashcan" Mac Pro with ollama </center></h3>""")
+    # Chat display, the user's text input, and the state object that carries
+    # the chat history between calls to chat().
     chatbot = gr.Chatbot()
     message = gr.Textbox(placeholder="Type here")
     state = gr.State()
+    # Sampling controls; their current values are passed to chat() on each click.
     with gr.Row():
+        top_k = gr.Slider(0.0, 100.0, label="top_k", value=40, info="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)")
+        top_p = gr.Slider(0.0, 1.0, label="top_p", value=0.9, info="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)")
+        temp = gr.Slider(0.0, 2.0, label="temperature", value=0.8, info="The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)")
     submit = gr.Button("SEND")
+    # Use .click() to trigger the response streaming. The inputs map
+    # positionally onto chat(input, chat_history, top_k, top_p, temp); each
+    # pair that chat() yields goes to the chatbot and the state respectively.
     submit.click(chat, inputs=[message, state, top_k, top_p, temp], outputs=[chatbot, state])
+# Launch the app only when this file is run as a script.
 if __name__ == "__main__":
+    block.launch()