import requests
from openai import OpenAI
import json
import matplotlib.pyplot as plt

class TranscriptProcessor:
    def __init__(self, vids=None, transcript_savepath=None):
        if not vids and not transcript_savepath:
            raise ValueError('No transcripts or savepath provided')

        if transcript_savepath:
            # Load previously saved transcripts from disk
            with open(transcript_savepath, 'r') as file:
                self.vids = json.load(file)
        else:
            self.vids = vids

        self.filtered_vids = self.vids
    
    def remove_missing_transcripts(self, vids):
        # Drop entries that have no transcript (e.g. YouTube Shorts)
        self.filtered_vids = []
        for vid in vids:
            if 'transcript' in vid:
                self.filtered_vids.append(vid)
            else:
                print('Missing transcript (likely YT Short):', vid['title'])
        return self.filtered_vids
    
    def filter_transcripts(self, vids, bounds=(0, float('inf'))):
        # Keep only transcripts whose character length falls within bounds
        self.filtered_vids = []
        for vid in vids:
            vid['length'] = len(vid['transcript'])
            if bounds[0] < vid['length'] < bounds[1]:
                self.filtered_vids.append(vid)
        # Optional: inspect the length distribution
        # self.filtered_vids = sorted(self.filtered_vids, key=lambda v: v['length'])
        # plt.hist([vid['length'] for vid in self.filtered_vids], bins=50)
        # plt.savefig('transcript_lengths.png')
        return self.filtered_vids
    
    def save_filtered_transcripts(self, savepath):
        with open(savepath, 'w') as file:
            json.dump(self.filtered_vids, file)


class TranscriptSummarizer(TranscriptProcessor):
    def __init__(self, vids=None, transcript_savepath=None):
        super().__init__(vids=vids, transcript_savepath=transcript_savepath)
        self.client = OpenAI()  # reads OPENAI_API_KEY from the environment
        self.default_dev_prompt = (
            'You are a transcript summarizer, your purpose is to provide a detailed summary of a transcript '
            'without introducing any of your own bias or subjectivity. Shorten the transcript while keeping all '
            'important information, including main topics, research findings, their connections and importance, '
            'concrete examples demonstrated, reasoning steps, and concrete solutions. Make sure to delve into how '
            'the thought process builds and how the conclusions are logically derived. Do not try to be concise. '
            'Do not interject your own opinion. Do not structure the output in any way with titles or bullet points.'
        )
    
    def summarize_transcripts(self, vids, model_name='gpt-4o-mini', dev_prompt=None):
        # 'self' is unavailable in default arguments, so resolve the default prompt here
        if dev_prompt is None:
            dev_prompt = self.default_dev_prompt

        for i, vid in enumerate(vids):
            print('Processing video', i, vid['title'])
            try:
                completion = self.client.chat.completions.create(
                    model=model_name,
                    messages=[
                        {'role': 'developer', 'content': dev_prompt},
                        {'role': 'user', 'content': vid['transcript']}
                    ]
                )
            except Exception as e:
                print('Failed to process video', i, e)
                break
            vid['summary'] = completion.choices[0].message.content

        self.filtered_vids = vids
        return vids
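

# Minimal usage sketch, an assumption about the intended workflow rather than part of
# the original file: the 'transcripts.json' and 'filtered.json' paths and the length
# bounds below are hypothetical placeholders.
if __name__ == '__main__':
    summarizer = TranscriptSummarizer(transcript_savepath='transcripts.json')
    vids = summarizer.remove_missing_transcripts(summarizer.vids)
    vids = summarizer.filter_transcripts(vids, bounds=(500, 50000))
    vids = summarizer.summarize_transcripts(vids)
    summarizer.save_filtered_transcripts('filtered.json')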