Spaces:

NathanPap
/

TinyLlama-TinyLlama-1.1B-Chat-v1.0

Running

App Files Community

NathanPap committed on Feb 16

Commit

0c95fb4

verified ·

1 Parent(s): d05a699

Update utils.py

Browse files

Files changed (1) hide show

utils.py +61 -25

utils.py CHANGED Viewed

@@ -24,30 +24,60 @@ class CSVAnalyzer:
         self.tokenizer, self.model = CSVAnalyzer.load_model()
     def prepare_context(self, df: pd.DataFrame) -> str:
         try:
             if df.empty:
                 return "Keine Daten verfügbar"
-            # Création d'un résumé plus concis et direct
-            building_problems = {}
-            for _, row in df.iterrows():
-                building = row['Gebäude'] if 'Gebäude' in df.columns else 'Unbekannt'
-                betreff = row['Betreff'] if 'Betreff' in df.columns else ''
-                inhalt = row['Inhalt'] if 'Inhalt' in df.columns else ''
-                if building not in building_problems:
-                    building_problems[building] = []
-                building_problems[building].append(f"{betreff}: {inhalt}")
-            # Construction du contexte
-            context = "Probleme nach Gebäude:\n\n"
-            for building, problems in building_problems.items():
-                context += f"GEBÄUDE {building}:\n"
-                for problem in problems:
-                    context += f"- {problem}\n"
-                context += "\n"
             return context
@@ -55,17 +85,23 @@ class CSVAnalyzer:
             raise Exception(f"Fehler bei der Kontextvorbereitung: {str(e)}")
     def generate_response(self, context: str, query: str) -> str:
-        prompt = f"""<|system|>Analysiere die folgenden Gebäudeprobleme und bestimme das problematischste Gebäude.
 Berücksichtige dabei:
-1. Schwere der Probleme (Heizung/Klima = kritisch, Reinigung = weniger kritisch)
-2. Anzahl der Probleme
-3. Auswirkungen auf Nutzer
 <|user|>
 {context}
 {query}
-Nenne direkt das problematischste Gebäude und begründe kurz warum.
 <|assistant|>"""
@@ -74,7 +110,7 @@ Nenne direkt das problematischste Gebäude und begründe kurz warum.
                 prompt,
                 return_tensors="pt",
                 truncation=True,
-                max_length=1024,
                 padding=True,
                 return_attention_mask=True
             )
@@ -83,8 +119,8 @@ Nenne direkt das problematischste Gebäude und begründe kurz warum.
                 outputs = self.model.generate(
                     input_ids=inputs["input_ids"],
                     attention_mask=inputs["attention_mask"],
-                    max_new_tokens=128,  # Réduit pour des réponses plus concises
-                    temperature=0.1,     # Réduit pour des réponses plus directes
                     top_p=0.95,
                     repetition_penalty=1.15,
                     do_sample=True

         self.tokenizer, self.model = CSVAnalyzer.load_model()
     def prepare_context(self, df: pd.DataFrame) -> str:
+        """
+        Bereitet einen allgemeinen Kontext aus dem DataFrame vor.
+        """
         try:
             if df.empty:
                 return "Keine Daten verfügbar"
+            # Grundlegende Informationen
+            context = "DATASET ÜBERSICHT:\n\n"
+            # Dimensionen
+            context += f"Datensatzgröße: {len(df)} Zeilen, {len(df.columns)} Spalten\n\n"
+            # Spaltenliste mit Datentypen
+            context += "SPALTENINFORMATIONEN:\n"
+            for col in df.columns:
+                dtype = df[col].dtype
+                non_null = df[col].count()
+                null_percentage = (len(df) - non_null) / len(df) * 100
+                # Erkennung spezieller Datentypen
+                if pd.api.types.is_datetime64_any_dtype(df[col]):
+                    date_range = f"von {df[col].min()} bis {df[col].max()}"
+                    context += f"- {col} (Datum): {date_range}, {null_percentage:.1f}% NULL\n"
+                elif pd.api.types.is_numeric_dtype(df[col]):
+                    context += f"- {col} (Numerisch): Min={df[col].min():.2f}, Max={df[col].max():.2f}, {null_percentage:.1f}% NULL\n"
+                else:
+                    unique_values = df[col].nunique()
+                    context += f"- {col} (Text): {unique_values} eindeutige Werte, {null_percentage:.1f}% NULL\n"
+            # Grundlegende Statistiken für numerische Spalten
+            numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
+            if not numeric_cols.empty:
+                context += "\nNUMERISCHE STATISTIKEN:\n"
+                stats = df[numeric_cols].describe()
+                for col in numeric_cols:
+                    context += f"- {col}:\n"
+                    context += f"  Durchschnitt: {stats[col]['mean']:.2f}\n"
+                    context += f"  Median: {stats[col]['50%']:.2f}\n"
+                    context += f"  Standardabweichung: {stats[col]['std']:.2f}\n"
+            # Zeitliche Informationen, falls vorhanden
+            date_cols = df.select_dtypes(include=['datetime64']).columns
+            if not date_cols.empty:
+                context += "\nZEITLICHE INFORMATIONEN:\n"
+                for col in date_cols:
+                    context += f"- {col}:\n"
+                    context += f"  Zeitspanne: von {df[col].min()} bis {df[col].max()}\n"
+                    context += f"  Anzahl eindeutiger Daten: {df[col].nunique()}\n"
+            # Stichprobe der Daten
+            context += "\nDATENBEISPIELE:\n"
+            sample = df.head(3).to_string()
+            context += f"{sample}\n"
             return context
             raise Exception(f"Fehler bei der Kontextvorbereitung: {str(e)}")
     def generate_response(self, context: str, query: str) -> str:
+        prompt = f"""<|system|>
+Du bist ein Datenanalyst, der CSV-Dateien analysiert. Deine Aufgabe ist es, Fragen über die Daten zu beantworten.
 Berücksichtige dabei:
+1. Die Struktur und die Arten der verfügbaren Daten
+2. Statistische Informationen, falls relevant
+3. Mögliche Zusammenhänge zwischen verschiedenen Spalten
+4. Zeitliche Muster, falls Datumsinformationen vorhanden sind
+Antworte präzise und faktenbasiert. Wenn die Frage nicht mit den verfügbaren Daten beantwortet werden kann,
+erkläre warum.
 <|user|>
+KONTEXT:
 {context}
+FRAGE:
 {query}
 <|assistant|>"""
                 prompt,
                 return_tensors="pt",
                 truncation=True,
+                max_length=2048,  # Erhöht für komplexere Analysen
                 padding=True,
                 return_attention_mask=True
             )
                 outputs = self.model.generate(
                     input_ids=inputs["input_ids"],
                     attention_mask=inputs["attention_mask"],
+                    max_new_tokens=256,  # Erhöht für ausführlichere Antworten
+                    temperature=0.7,     # Erhöht für kreativere Analysen
                     top_p=0.95,
                     repetition_penalty=1.15,
                     do_sample=True