crystal-technologies committed
Commit 714d948 · 1 Parent(s): 4a51238

Upload 1653 files
.gitattributes CHANGED
@@ -4,3 +4,4 @@ Perceptrix/finetune/scripts/eval/local_data/reading_comprehension/narrative_qa.j
 Perceptrix/finetune/scripts/eval/local_data/symbolic_problem_solving/bigbench_elementary_math_qa.jsonl filter=lfs diff=lfs merge=lfs -text
 SoundScribe/SpeakerID/tools/speech_data_explorer/screenshot.png filter=lfs diff=lfs merge=lfs -text
 SoundScribe/voices/Vatsal.wav filter=lfs diff=lfs merge=lfs -text
+database/audio.wav filter=lfs diff=lfs merge=lfs -text
CircumSpect/vqa/vqa.py CHANGED
@@ -4,7 +4,7 @@ from CircumSpect.vqa.conversation_obj import conv_templates_obj, SeparatorStyle_
 from CircumSpect.vqa.conversation_vqa import conv_templates, SeparatorStyle
 from transformers import AutoTokenizer, BitsAndBytesConfig
 from CircumSpect.vqa.utils import disable_torch_init
-from CircumSpect.vqa.streamer import TextStreamer
+from Perceptrix.streamer import TextStreamer
 from CircumSpect.vqa.model import *
 from utils import setup_device
 from io import BytesIO
@@ -95,6 +95,9 @@ if "mpt" in model_name.lower():
 else:
     roles = conv.roles
 
+streamer = TextStreamer(tokenizer, skip_prompt=True,
+                        skip_special_tokens=True, save_file="vlm-reply.txt")
+
 
 def answer_question(question, image_file):
     conv = conv_templates[conv_mode].copy()
@@ -130,8 +133,6 @@ def answer_question(question, image_file):
     keywords = [stop_str]
     stopping_criteria = KeywordsStoppingCriteria(
         keywords, tokenizer, input_ids)
-    streamer = TextStreamer(tokenizer, skip_prompt=True,
-                            skip_special_tokens=True)
 
     with open("./database/vlm-reply.txt", 'w') as clear_file:
         clear_file.write("")
@@ -193,8 +194,6 @@ def find_object_description(question, image_file):
     keywords = [stop_str]
     stopping_criteria = KeywordsStoppingCriteria(
         keywords, tokenizer, input_ids)
-    streamer = TextStreamer(tokenizer, skip_prompt=True,
-                            skip_special_tokens=True)
 
     with torch.inference_mode():
         output_ids = model.generate(
Perceptrix/__init__.py ADDED
@@ -0,0 +1,2 @@
+from Perceptrix.engine import robotix, identify_objects_from_text, search_keyword
+from chat import perceptrix
Perceptrix/chat.py ADDED
@@ -0,0 +1,123 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig, GenerationConfig
+from Perceptrix.streamer import TextStreamer
+from utils import setup_device
+import torch
+import os
+
+model_name = os.environ.get('CHAT_MODEL')
+
+model_path = "models/CRYSTAL-chat" if model_name == None else model_name
+config = AutoConfig.from_pretrained(
+    model_path, trust_remote_code=True)
+
+device = setup_device()
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float32 if device == "cpu" else torch.bfloat16
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    torch_dtype=torch.float32 if device == "cpu" else torch.bfloat16,
+    config=config,
+    device_map="auto",
+    trust_remote_code=True,
+    low_cpu_mem_usage=True,
+    offload_folder="offloads",
+    quantization_config=bnb_config if str(device) != "cpu" else None,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+)
+
+if tokenizer.pad_token_id is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+tokenizer.padding_side = "left"
+tokenizer = tokenizer
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.eval()
+
+streamer = TextStreamer(tokenizer, skip_prompt=True,
+                        skip_special_tokens=True, save_file="reply.txt")
+
+def evaluate(
+    prompt='',
+    temperature=0.4,
+    top_p=0.65,
+    top_k=35,
+    repetition_penalty=1.1,
+    max_new_tokens=512,
+    **kwargs,
+):
+    inputs = tokenizer(prompt, return_tensors="pt")
+    input_ids = inputs["input_ids"].to(device)
+    generation_config = GenerationConfig(
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
+        **kwargs,
+    )
+
+    with torch.no_grad():
+        generation_output = model.generate(
+            input_ids=input_ids,
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+            output_scores=True,
+            max_new_tokens=max_new_tokens,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id,
+            streamer=streamer,
+        )
+    s = generation_output.sequences[0]
+    output = tokenizer.decode(s, skip_special_tokens=True)
+    yield output.split("### Response:")[-1].strip()
+
+
+def predict(
+    inputs,
+    temperature=0.4,
+    top_p=0.65,
+    top_k=35,
+    repetition_penalty=1.1,
+    max_new_tokens=512,
+):
+    now_prompt = inputs
+
+    response = evaluate(
+        now_prompt, temperature, top_p, top_k, repetition_penalty, max_new_tokens, do_sample=True
+    )
+
+    for i in response:
+        print(i)
+        response = i
+
+    return response
+
+
+instructions = "You are Comprehensive Robotics Yielding Sophisticated Technology And Logistics (CRYSTAL), an AI robot developed by Vatsal Dutt to be the most advanced robot in the world. You will be provided with prompts and other information to help the user."
+
+def perceptrix(prompt):
+    prompt = instructions+"\n"+prompt
+    response = predict(
+        inputs=prompt, temperature=0.2, top_p=0.9, max_new_tokens=512
+    )
+    spl_tokens = ["<|im_start|>", "<|im_end|>"]
+    clean_prompt = prompt.replace(spl_tokens[0], "").replace(spl_tokens[1], "")
+    return response[len(clean_prompt):]
+
+
+if __name__ == "__main__":
+    history = ""
+    while True:
+        user_input = input("User: ")
+        user_input = "<|im_start|>User\n"+user_input+"<|im_end|>\n<|im_start|>CRYSTAL\n"
+        result = perceptrix(history+user_input)
+        history += user_input + result + "<|im_end|>\n"
Perceptrix/engine.py CHANGED
@@ -1,7 +1,5 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig
-from Perceptrix.callbacks import Iteratorize, Stream
 from utils import setup_device
-import transformers
 import torch
 import tqdm
 import os
@@ -10,9 +8,6 @@ model_name = os.environ.get('LLM_MODEL')
 
 model_path = "models/CRYSTAL-instruct" if model_name == None else model_name
 
-config = transformers.AutoConfig.from_pretrained(
-    model_name, trust_remote_code=True)
-
 device = setup_device()
 
 bnb_config = BitsAndBytesConfig(
@@ -25,7 +20,6 @@ bnb_config = BitsAndBytesConfig(
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.float32 if device == "cpu" else torch.bfloat16,
-    config=config,
     device_map="auto",
     trust_remote_code=True,
     low_cpu_mem_usage=True,
@@ -36,6 +30,7 @@ model = AutoModelForCausalLM.from_pretrained(
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
     trust_remote_code=True,
+    use_fast=True,
 )
 
 PROMPT = '''### Instruction:
@@ -52,10 +47,6 @@ tokenizer = tokenizer
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.eval()
 
-INSTRUCTION_KEY = "### Instruction:"
-RESPONSE_KEY = "### Response:"
-END_KEY = "### End"
-
 
 def evaluate(
     prompt='',
@@ -76,39 +67,6 @@ def evaluate(
         repetition_penalty=repetition_penalty,
         **kwargs,
     )
-    generate_params = {
-        "input_ids": input_ids,
-        "generation_config": generation_config,
-        "return_dict_in_generate": True,
-        "output_scores": True,
-        "max_new_tokens": max_new_tokens,
-    }
-
-    if stream_output:
-        def generate_with_callback(callback=None, **kwargs):
-            kwargs.setdefault(
-                "stopping_criteria", transformers.StoppingCriteriaList()
-            )
-            kwargs["stopping_criteria"].append(
-                Stream(callback_func=callback)
-            )
-            with torch.no_grad():
-                model.generate(**kwargs)
-
-        def generate_with_streaming(**kwargs):
-            return Iteratorize(
-                generate_with_callback, kwargs, callback=None
-            )
-
-        with generate_with_streaming(**generate_params) as generator:
-            for output in generator:
-                decoded_output = tokenizer.decode(output)
-
-                if output[-1] in [tokenizer.eos_token_id]:
-                    break
-
-                yield decoded_output.split("### Response:")[-1].strip()
-        return
 
     with torch.no_grad():
         generation_output = model.generate(
@@ -151,21 +109,6 @@ def run_instruction(
     return response
 
 
-def perceptrix(prompt, stop=None):
-    instructions = """You are Comprehensive Robotics Yielding Sophisticated Technology And Logistics (CRYSTAL), an AI robot developed by Vatsal Dutt to be the most advanced robot in the world. You will be provided with prompts and other information to help the user."""
-    answer = ''.join(run_instruction(
-        instructions,
-        "User: "+prompt+"\nCRYSTAL:",
-        temperature=0.6,
-        top_p=0.6,
-        top_k=200,
-        repetition_penalty=1.1,
-        max_new_tokens=256,
-        stream_output=False,
-    ))
-    return answer
-
-
 def search_keyword(prompt):
     instructions = """Prompt:Time: Fri, 23 August 2023 2:30PM\nWeather: 73F\nHow many friends have I told you about?
 Search Keyword:Friends
@@ -277,8 +220,4 @@ if object_distance > 10:
         max_new_tokens=256,
         stream_output=False,
     ))
-    return answer
-
-
-if __name__ == "__main__":
-    perceptrix("Hello! How are you?")
+    return answer
Perceptrix/streamer.py ADDED
@@ -0,0 +1,216 @@
+from queue import Queue
+from typing import TYPE_CHECKING, Optional
+
+
+if TYPE_CHECKING:
+    from transformers.models.auto import AutoTokenizer
+
+
+class BaseStreamer:
+    """
+    Base class from which `.generate()` streamers should inherit.
+    """
+
+    def put(self, value):
+        """Function that is called by `.generate()` to push new tokens"""
+        raise NotImplementedError()
+
+    def end(self):
+        """Function that is called by `.generate()` to signal the end of generation"""
+        raise NotImplementedError()
+
+
+class TextStreamer(BaseStreamer):
+    """
+    Simple text streamer that prints the token(s) to stdout as soon as entire words are formed.
+
+    <Tip warning={true}>
+
+    The API for the streamer classes is still under development and may change in the future.
+
+    </Tip>
+
+    Parameters:
+        tokenizer (`AutoTokenizer`):
+            The tokenized used to decode the tokens.
+        skip_prompt (`bool`, *optional*, defaults to `False`):
+            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
+        decode_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the tokenizer's `decode` method.
+
+    Examples:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+
+        >>> tok = AutoTokenizer.from_pretrained("gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
+        >>> streamer = TextStreamer(tok)
+
+        >>> # Despite returning the usual output, the streamer will also print the generated text to stdout.
+        >>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
+        An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
+        ```
+    """
+
+    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, save_file="reply.txt", **decode_kwargs):
+        self.tokenizer = tokenizer
+        self.skip_prompt = skip_prompt
+        self.decode_kwargs = decode_kwargs
+
+        # variables used in the streaming process
+        self.token_cache = []
+        self.print_len = 0
+        self.next_tokens_are_prompt = True
+        self.save_file = save_file
+
+    def put(self, value):
+        """
+        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
+        """
+        if len(value.shape) > 1 and value.shape[0] > 1:
+            raise ValueError("TextStreamer only supports batch size 1")
+        elif len(value.shape) > 1:
+            value = value[0]
+
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+            return
+
+        # Add the new token to the cache and decodes the entire thing.
+        self.token_cache.extend(value.tolist())
+        text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+
+        # After the symbol for a new line, we flush the cache.
+        if text.endswith("\n"):
+            printable_text = text[self.print_len :]
+            self.token_cache = []
+            self.print_len = 0
+        # If the last token is a CJK character, we print the characters.
+        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
+            printable_text = text[self.print_len :]
+            self.print_len += len(printable_text)
+        # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
+        # which may change with the subsequent token -- there are probably smarter ways to do this!)
+        else:
+            printable_text = text[self.print_len : text.rfind(" ") + 1]
+            self.print_len += len(printable_text)
+
+        self.on_finalized_text(printable_text)
+
+    def end(self):
+        """Flushes any remaining cache and prints a newline to stdout."""
+        # Flush the cache, if it exists
+        if len(self.token_cache) > 0:
+            text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+            printable_text = text[self.print_len :]
+            self.token_cache = []
+            self.print_len = 0
+        else:
+            printable_text = ""
+
+        self.next_tokens_are_prompt = True
+        self.on_finalized_text(printable_text, stream_end=True)
+
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Prints the new text to stdout. If the stream is ending, also prints a newline."""
+        print(text, flush=True, end="" if not stream_end else None)
+        with open(f"./database/{self.save_file}", 'a') as reply:
+            reply.write(text)
+
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        ):  #
+            return True
+
+        return False
+
+
+class TextIteratorStreamer(TextStreamer):
+    """
+    Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
+    useful for applications that benefit from acessing the generated text in a non-blocking way (e.g. in an interactive
+    Gradio demo).
+
+    <Tip warning={true}>
+
+    The API for the streamer classes is still under development and may change in the future.
+
+    </Tip>
+
+    Parameters:
+        tokenizer (`AutoTokenizer`):
+            The tokenized used to decode the tokens.
+        skip_prompt (`bool`, *optional*, defaults to `False`):
+            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
+        timeout (`float`, *optional*):
+            The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
+            in `.generate()`, when it is called in a separate thread.
+        decode_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the tokenizer's `decode` method.
+
+    Examples:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+        >>> from threading import Thread
+
+        >>> tok = AutoTokenizer.from_pretrained("gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
+        >>> streamer = TextIteratorStreamer(tok)
+
+        >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+        >>> generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
+        >>> thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        >>> thread.start()
+        >>> generated_text = ""
+        >>> for new_text in streamer:
+        ...     generated_text += new_text
+        >>> generated_text
+        'An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,'
+        ```
+    """
+
+    def __init__(
+        self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
+    ):
+        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
+        self.text_queue = Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.text_queue.put(text, timeout=self.timeout)
+        if stream_end:
+            self.text_queue.put(self.stop_signal, timeout=self.timeout)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        value = self.text_queue.get(timeout=self.timeout)
+        if value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value
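
The `save_file` argument is the main difference from the upstream `transformers` streamer: every finalized chunk is also appended to `./database/<save_file>`. A minimal usage sketch, mirroring how chat.py and vqa.py wire it up (the "gpt2" checkpoint is a stand-in for illustration, not part of this commit):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from Perceptrix.streamer import TextStreamer

# Stand-in model; the repo loads models/CRYSTAL-chat instead.
tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Tokens stream to stdout and are appended to ./database/reply.txt
# (the database/ directory must already exist, as it does in this repo).
streamer = TextStreamer(tok, skip_prompt=True,
                        skip_special_tokens=True, save_file="reply.txt")

inputs = tok("An increasing sequence: one,", return_tensors="pt")
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
```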
README.md ADDED
@@ -0,0 +1,3 @@
+Run crystal.py
+Train LLM `pip install -e .` inside finetuning folder
+Install Speaker Identification `pip install .` and `pip install -r requirements/requirements_lightning.txt requirements/requirements_asr.txt`
SoundScribe/speak.py CHANGED
@@ -9,8 +9,8 @@ import wave
 
 FRAMES_PER_BUFFER = 1000
 FORMAT = pyaudio.paInt16
-CHANNELS = 1
-RATE = 23500
+CHANNELS = 2
+RATE = 26000
 
 
 device = setup_device()
SoundScribe/transcribe.py CHANGED
@@ -17,32 +17,33 @@ silence_duration = 0
 output_file = sf.SoundFile(
     'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)
 
-
-model = whisper.load_model("base")
 transcription_in_progress = False
 queued = False
+first_run = True
+transcription_text = ""
 
 
 def transcribe(audio):
+    if first_run:
+        model = whisper.load_model("base")
+        first_run = False
     result = model.transcribe(audio)
     transcription = result['text']
     # user = find_user("database/recording.wav")
     user = "Vatsal"
-    if user != "Crystal":
-        with open('./database/input.txt', 'w', encoding="utf-8") as write_to:
-            write_to.write(transcription[1:])
     return transcription, user
 
 
 def transcription():
     global transcription_in_progress
-    transcription, user = transcribe_api('database/recording.wav')
+    global transcription_text
+    transcription_text, user = transcribe_api('database/recording.wav')
     print("-"*100)
-    print(f'Transcription: {transcription} from user {user}')
+    print(f'Transcription: {transcription_text} from user {user}')
     print("-"*100)
     transcription_in_progress = False
 
-def listen(model, stream):
+def listen(stream):
     global transcription_in_progress
     global queued
     global silence_duration
@@ -73,6 +74,8 @@ def listen(model, stream):
         transcription()
         queued = False
 
+        with open('./database/input.txt', 'w', encoding="utf-8") as write_to:
+            write_to.write(transcription_text[1:])
         silence_duration = 0
         output_file.close()
         audio_data = None
@@ -84,7 +87,7 @@ def live_listen():
     with sd.InputStream(channels=CHANNELS, blocksize=BLOCKSIZE, samplerate=SAMPLE_RATE) as stream:
         print("STARTING LIVE TRANSCRIPTION")
         while True:
-            listen(model, stream)
+            listen(stream)
 
 
 if __name__ == "__main__":
api_host.py CHANGED
@@ -21,7 +21,8 @@ app = Flask(__name__)
 def home():
     return jsonify({'message': 'WORKING'})
 
-def handle_request(func, *args):
+
+def thread_task(func, *args):
     try:
         result = func(*args)
         return jsonify({'message': result})
@@ -36,7 +37,7 @@ def _locate_object():
     image_data = np.array(image_data, dtype=np.uint8)
     image = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
     cv2.imwrite('API.jpg', image)
-    return handle_request(locate_object, prompt, "API.jpg")
+    return thread_task(locate_object, prompt, "API.jpg")
 
 @app.route('/vqa', methods=['POST', 'GET'])
 def _vqa():
@@ -45,7 +46,7 @@ def _vqa():
     image_data = np.array(image_data, dtype=np.uint8)
     image = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
     cv2.imwrite('API.jpg', image)
-    return handle_request(answer_question, prompt, "API.jpg")
+    return thread_task(answer_question, prompt, "API.jpg")
 
 @app.route('/object_description', methods=['POST', 'GET'])
 def _object_description():
@@ -53,27 +54,27 @@ def _object_description():
     image_data = np.array(image_data, dtype=np.uint8)
     image = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
     cv2.imwrite('API.jpg', image)
-    return handle_request(find_object_description, "API.jpg")
+    return thread_task(find_object_description, "API.jpg")
 
 @app.route('/perceptrix', methods=['POST', 'GET'])
 def _perceptrix():
     prompt = request.json['prompt']
-    return handle_request(perceptrix, prompt)
+    return thread_task(perceptrix, prompt)
 
 @app.route('/robotix', methods=['POST', 'GET'])
 def _robotix():
     prompt = request.json['prompt']
-    return handle_request(robotix, prompt)
+    return thread_task(robotix, prompt)
 
 @app.route('/search_keyword', methods=['POST', 'GET'])
 def _search_keyword():
     prompt = request.json['prompt']
-    return handle_request(search_keyword, prompt)
+    return thread_task(search_keyword, prompt)
 
 @app.route('/identify_objects_from_text', methods=['POST', 'GET'])
 def _identify_objects_from_text():
     prompt = request.json['prompt']
-    return handle_request(identify_objects_from_text, prompt)
+    return thread_task(identify_objects_from_text, prompt)
 
 @app.route('/transcribe', methods=['POST', 'GET'])
 def _upload_audio():
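
All routes still call the renamed `thread_task` helper synchronously, so clients interact with them as plain JSON endpoints. A minimal client sketch against the `/perceptrix` and `/vqa` routes (the localhost URL and frame.jpg path are placeholders; the deployed ngrok URL lives in utils.py):

```python
import cv2
import requests

API_URL = "http://localhost:5000/"  # placeholder; utils.py points at the ngrok tunnel

# Text route: POST a prompt, read back {'message': ...}.
reply = requests.post(API_URL + "perceptrix",
                      json={"prompt": "Hello CRYSTAL"}).json()
print(reply["message"])

# Image route: the image is sent as a plain list of encoded bytes,
# matching how utils.answer_question builds its payload.
_, image_data = cv2.imencode(".jpg", cv2.imread("frame.jpg"))
reply = requests.post(API_URL + "vqa",
                      json={"image": image_data.tolist(),
                            "prompt": "What is in front of me?"}).json()
print(reply["message"])
```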
crystal.py CHANGED
@@ -16,7 +16,7 @@ if USE_CLOUD_API:
     from utils import answer_question, find_object_description, locate_object, perceptrix, robotix, identify_objects_from_text, search_keyword
 else:
     from CircumSpect import answer_question, find_object_description, locate_object
-    from Perceptrix.engine import perceptrix, robotix, identify_objects_from_text, search_keyword
+    from Perceptrix import perceptrix, robotix, identify_objects_from_text, search_keyword
 
 
 SPEECH_COMMAND = "Speak"
@@ -30,7 +30,7 @@ AUTOMATION_COMAND = "Home Automation"
 weather = None
 
 device = setup_device()
-print("INITIALIZING CRYSTAL - DETECTED DEVICE:", str(device).upper(),
+print("INITIALIZING CRYSTAL - DEVICE:", str(device).upper(),
       "Acceleration" if str(device) != "cpu" else "")
 
 
@@ -103,7 +103,7 @@ def output(response, input_text):
 
     if helper:
         record_chat("Helper", helper)
-        response = perceptrix(response+"\nHelper: "+helper)
+        response = perceptrix("<|im_start|>CRYSTAL\n"+response+"<|im_end|>\n<|im_start|>Helper\n"+helper+"\n<|im_start|>CRYSTAL\n")
         output(response)
 
 
@@ -168,16 +168,15 @@ while True:
 
     relevant_history += "\n"+"\n".join(full_history[-3:])
 
-    relevant_history = f"{relevant_history}\n{username}: " + \
-        "\n" + input_text + "\nCRYSTAL: "
-    response = str(perceptrix(relevant_history))
+    prompt = f"{relevant_history}\n<|im_start|>{username}" + input_text + "<|im_end|>\n<|im_start|>CRYSTAL\n"
+    response = perceptrix(prompt)
     response = "<###CRYSTAL-INTERNAL###> Speech\n"+response
     with open("./database/input.txt", 'w') as clearfile:
         clearfile.write("")
 
     if current_events != "":
         print("CRYSTAL sees:", current_events)
-        response = str(perceptrix(input_text))
+        response = perceptrix(input_text)
         current_events = ""
 
-    output(response, input_text)
+    output(response, input_text)
database/audio.wav CHANGED
Binary files a/database/audio.wav and b/database/audio.wav differ
 
database/chat_history.jsonl CHANGED
@@ -0,0 +1,2 @@
+{"ID": "Vatsal", "message": "What is your name"}
+{"ID": "CRYSTAL", "message": "<###CRYSTAL-INTERNAL###> Speech\nMy name is CRYSTAL - Comprehensive Robotics Yielding Sophisticated Technologies and Logistics"}
database/current_frame.jpg CHANGED
database/current_frame_vqa.jpg ADDED
database/input.txt CHANGED
@@ -1 +1 @@
-Hello. Can you hear me?
+Tell me what can you do, Crystal.
database/recording.wav CHANGED
Binary files a/database/recording.wav and b/database/recording.wav differ
 
internet.py CHANGED
@@ -156,4 +156,4 @@ if __name__ == "__main__":
 
     weather = f"{location} is {name} with {temperature} and {details}"
     print(weather)
-    print(web_scraper("top news", True))
+    print(web_scraper("top news"))
robot.py CHANGED
@@ -53,6 +53,50 @@ def find(object):
     pass
 
 def grab():
+    """To find all components of quadrilateral ABCD, given sides AB, BC, CD, and angles A, B, and C, we can use the following steps:
+
+    Find the length of side AD.
+    We can use the law of cosines to find the length of AD:
+
+    AD^2 = AB^2 + BC^2 - 2 * AB * BC * cos(C)
+    AD = sqrt(AB^2 + BC^2 - 2 * AB * BC * cos(C))
+    AD = sqrt(192^2 + 116^2 - 2 * 192 * 116 * cos(118))
+    AD = 154.7
+    Find the angles of triangle ABC.
+    We can use the law of sines to find the angles of triangle ABC:
+
+    sin(A) / BC = sin(B) / AC
+    sin(C) / AB = sin(A) / AC
+    AC = sin(A) * BC / sin(B)
+    AC = sin(100) * 116 / sin(95)
+    AC = 120.7
+    sin(C) / AC = sin(B) / AB
+    BC = sin(C) * AC / sin(B)
+    BC = sin(118) * 120.7 / sin(95)
+    BC = 126.8
+    Now that we know the lengths of all sides of triangle ABC, we can use the law of cosines to find the angles B and C:
+
+    cos(B) = (AC^2 + AB^2 - BC^2) / (2 * AC * AB)
+    B = acos((AC^2 + AB^2 - BC^2) / (2 * AC * AB))
+    B = 92.9°
+    cos(C) = (BC^2 + AB^2 - AC^2) / (2 * BC * AB)
+    C = acos((BC^2 + AB^2 - AC^2) / (2 * BC * AB))
+    C = 119°
+    Find the angle of triangle ADC.
+    The angle of triangle ADC is the sum of angles A and B, minus 180 degrees:
+
+    ADC = A + B - 180°
+    ADC = 100 + 92.9 - 180°
+    ADC = -11.1°
+    Now that we know all of the components of quadrilateral ABCD, we can keep decreasing side CD by 1 unit until it is left with 70, and find the angles A, B, and C every time we decrease 1 unit.
+
+    To do this, we can use the following steps:
+
+    Update the length of side CD.
+    Find the new length of side AD using the law of cosines.
+    Find the new angles of triangle ABC using the law of sines and law of cosines.
+    Find the new angle of triangle ADC by subtracting 180 degrees from the sum of angles A and B.
+    We can repeat these steps until CD is equal to 70."""
     pass
 
 
utils.py CHANGED
@@ -9,9 +9,11 @@ import os
 
 API_URL = 'https://bceb7f41087d-7754001953109090881.ngrok-free.app/'
 
+
 def get_time():
     return datetime.datetime.now().strftime('%a %d %b %Y %I:%M %p')
 
+
 def load_chat():
     full_history = []
     sorted_list = []
@@ -22,14 +24,14 @@ def load_chat():
         id = chat_message['ID']
         message = chat_message['message']
         if id != prev_id:
-            full_history.append(f"{id}: {message}\n")
+            full_history.append(f"<|im_start|>{id}\n{message}<|im_end|>\n")
         else:
            full_history[-1] += message+"\n"
 
        prev_id = id
 
    for chat in full_history:
-        if chat.startswith("CRYSTAL: ") or chat.startswith("Helper: "):
+        if chat.startswith("<|im_start|>CRYSTAL") or chat.startswith("<|im_start|>Helper"):
            sorted_list[-1] += "\n"+chat
        else:
            sorted_list.append(chat)
@@ -40,7 +42,7 @@ def load_chat():
 def record_chat(role, message):
     new_message = {
         "ID": role,
-        "message": message[0]
+        "message": message
     }
 
     with open('./database/chat_history.jsonl', 'a') as history:
@@ -68,7 +70,7 @@ def check_api_usage():
     else:
         raise RuntimeError(
             "Unauthorized access! This action will be reported immediately!")
-
+
     return USE_CLOUD_API
 
 
@@ -114,10 +116,10 @@ def search_keyword(prompt):
 
 def answer_question(prompt, frame):
     url = API_URL+"vqa"
-    if type(frame) == str:
-        frame = cv2.imread(frame)
-
+    frame = cv2.imread(frame)
+
     _, image_data = cv2.imencode('.jpg', frame)
+
     image = image_data.tolist()
 
     payload = {'image': image,
@@ -167,6 +169,7 @@ def setup_device():
         device = torch.device("cpu")
     return device
 
+
 def transcribe(audio):
     url = API_URL + "transcribe"
     with open(audio, 'rb') as audio_file:
@@ -177,7 +180,4 @@ def transcribe(audio):
     print(transcription)
     # user = find_user("database/recording.wav")
     user = "Vatsal"
-    if user != "Crystal":
-        with open('./database/input.txt', 'w', encoding="utf-8") as write_to:
-            write_to.write(transcription[1:])
     return transcription, user
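
The load_chat and record_chat changes move the stored history onto the same ChatML-style markers that chat.py and crystal.py now build prompts with. An illustrative round-trip of one record from database/chat_history.jsonl:

```python
# Illustrative only: one record as stored in database/chat_history.jsonl
# and the turn string load_chat now builds from it.
record = {"ID": "Vatsal", "message": "What is your name"}
turn = f"<|im_start|>{record['ID']}\n{record['message']}<|im_end|>\n"
print(turn)
# <|im_start|>Vatsal
# What is your name<|im_end|>
```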