katsukiai committed on
Commit 8b0650e · verified · 1 Parent(s): f50c011

Update app.py

Files changed (1)
  1. app.py +29 -48
app.py CHANGED
@@ -3,14 +3,14 @@ import csv
 import json
 import logging
 import gradio as gr
+import pandas as pd
 from tqdm import tqdm
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import wordnet
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from huggingface_hub import HfApi, Repository, login
+from huggingface_hub import HfApi, login
 from datasets import Dataset
-import pandas as pd
 from datetime import datetime
 import secrets
 
@@ -31,52 +31,43 @@ error_log_file = os.path.join(error_dir, f"errors_{datetime.now().strftime('%Y%m
 def log_error(error_msg):
     with open(error_log_file, 'a') as f:
         f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - ERROR - {error_msg}\n")
-    try:
-        api = HfApi()
-        api.upload_file(
-            path_or_fileobj=error_log_file,
-            path_in_repo=f"errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
-            repo_id="katsukiai/errors",
-            repo_type="dataset"
-        )
-    except Exception as e:
-        logging.error(f"Failed to upload error log: {str(e)}")
-
-tokenizer = AutoTokenizer.from_pretrained("amd/Instella-3B-Instruct",trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained("amd/Instella-3B-Instruct",trust_remote_code=True)
-meaning_generator = pipeline("text2text-generation", model="google/flan-t5-large")
 
 HF_TOKEN = os.getenv("HF_TOKEN", secrets.token_hex(16))
 login(token=HF_TOKEN)
 
+tokenizer = AutoTokenizer.from_pretrained("amd/Instella-3B-Instruct", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained("amd/Instella-3B-Instruct", trust_remote_code=True)
+meaning_generator = pipeline("text2text-generation", model="google/flan-t5-large")
+
 dataset_dir = "dataset"
 os.makedirs(dataset_dir, exist_ok=True)
 csv_file = os.path.join(dataset_dir, "deepfocus_data.csv")
+parquet_file = os.path.join(dataset_dir, "deepfocus_data.parquet")
 
 def process_text_to_csv(input_text):
     try:
         tokens = word_tokenize(input_text.lower())
         words = list(set(tokens))
         data = []
+
+        existing_df = pd.read_parquet(parquet_file) if os.path.exists(parquet_file) else pd.DataFrame(columns=["words", "meaning"])
+        existing_words = set(existing_df["words"].tolist())
+
         for word in tqdm(words, desc="Processing words"):
-            meanings = []
-            synsets = wordnet.synsets(word)
-            if synsets:
-                meanings = [syn.definition() for syn in synsets[:3]]
-            else:
-                try:
-                    generated_meaning = meaning_generator(f"Define the word '{word}'", max_length=100)[0]['generated_text']
-                    meanings.append(generated_meaning)
-                except Exception as e:
-                    log_error(f"Meaning generation failed for '{word}': {str(e)}")
-            data.append({"tokenizer": tokens, "words": word, "meaning": meanings})
-
-        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
-            writer = csv.DictWriter(f, fieldnames=["tokenizer", "words", "meaning"])
-            writer.writeheader()
-            writer.writerows(data)
-
-        logging.info(f"Dataset saved to {csv_file}")
+            if word in existing_words:
+                continue
+
+            meanings = [syn.definition() for syn in wordnet.synsets(word)[:3]] or \
+                       [meaning_generator(f"Define the word '{word}'", max_length=100)[0]['generated_text']]
+
+            data.append({"words": word, "meaning": meanings})
+
+        if data:
+            new_df = pd.DataFrame(data)
+            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
+            combined_df.to_parquet(parquet_file, index=False)
+            combined_df.to_csv(csv_file, index=False, encoding='utf-8')
+
         return data
     except Exception as e:
         log_error(f"Error in process_text_to_csv: {str(e)}")
@@ -84,9 +75,8 @@ def process_text_to_csv(input_text):
 
 def upload_to_huggingface():
     try:
-        dataset = Dataset.from_csv(csv_file)
+        dataset = Dataset.from_parquet(parquet_file)
         dataset.push_to_hub("katsukiai/DeepFocus-X3", token=HF_TOKEN)
-        logging.info("Dataset uploaded to Hugging Face")
     except Exception as e:
         log_error(f"Error uploading to Hugging Face: {str(e)}")
         raise
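With the upload now sourced from the parquet file, a quick round-trip check of the pushed repo could look like this (a sketch, assuming the push succeeded and the token can read `katsukiai/DeepFocus-X3`):

```python
from datasets import load_dataset

# Pull the dataset back from the Hub and spot-check the schema.
ds = load_dataset("katsukiai/DeepFocus-X3", split="train")
print(ds.column_names)  # expected: ['words', 'meaning']
print(ds[0])
```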
@@ -103,10 +93,7 @@ def generate_output(input_text):
 def view_logs():
     try:
         log_files = os.listdir(log_dir)
-        log_content = ""
-        for log_file in log_files:
-            with open(os.path.join(log_dir, log_file), 'r') as f:
-                log_content += f"\n\n--- {log_file} ---\n\n{f.read()}"
+        log_content = "".join(f"\n\n--- {log_file} ---\n\n{open(os.path.join(log_dir, log_file), 'r').read()}" for log_file in log_files)
         return log_content
     except Exception as e:
         log_error(f"Error in view_logs: {str(e)}")
@@ -117,13 +104,7 @@ with gr.Blocks(title="DeepFocus-X3") as demo:
 
     with gr.Tabs():
         with gr.TabItem("About"):
-            gr.Markdown("""
-            ## About DeepFocus-X3
-            This application processes text, tokenizes it, extracts unique words, generates meanings, and uploads the dataset to Hugging Face.
-            - Uses NLTK for tokenization and WordNet for meanings.
-            - Leverages DeepSeek AI for long text processing and Google FLAN-T5 for meaning generation.
-            - Logs all activities and errors, with error logs uploaded to Hugging Face.
-            """)
+            gr.Markdown("## About DeepFocus-X3\nThis application processes text, tokenizes it, extracts unique words, generates meanings, and uploads the dataset to Hugging Face.")
 
         with gr.TabItem("Generate all"):
             input_text = gr.Textbox(label="Input Text", lines=10)
@@ -137,4 +118,4 @@ with gr.Blocks(title="DeepFocus-X3") as demo:
     view_logs_btn = gr.Button("View Logs")
     view_logs_btn.click(fn=view_logs, inputs=None, outputs=log_output)
 
-    demo.launch()
+demo.launch()
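The final hunk dedents `demo.launch()` out of the `with gr.Blocks()` body. Both placements work, but launching after the context manager closes is the usual idiom in Gradio's examples, as in this minimal sketch:

```python
import gradio as gr

with gr.Blocks(title="DeepFocus-X3") as demo:
    gr.Markdown("UI components are declared inside the context manager.")

# The Blocks object is fully built once the `with` block exits,
# so launching at module level is safe.
demo.launch()
```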