process_transcript.py

import os
import re

# NLTK reads NLTK_DATA when it is imported, so the variable must be set
# before the import (the original set it afterwards, which has no effect).
# The hard-coded per-user path is generalized to the current home directory.
os.environ.setdefault('NLTK_DATA', os.path.expanduser('~/nltk_data'))

import nltk
from nltk.tokenize import sent_tokenize, TextTilingTokenizer

import spacy
import torch
from pathlib import Path

import inquirer
from transformers import BertTokenizer, BertForMaskedLM
def clean_transcript(transcript):
    # Remove the filler words 'uh' and 'um' (plus a trailing comma/space).
    # Word boundaries are required: a bare str.replace('uh', '') would also
    # corrupt words such as 'church' or 'summer'.
    cleaned = re.sub(r'\b(?:uh|um)\b,?\s*', '', transcript, flags=re.IGNORECASE)
    # Collapse any doubled spaces left behind by the removal.
    return re.sub(r' {2,}', ' ', cleaned)
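
# Illustrative example of the word-boundary behaviour above (assumed input,
# not from the original script):
#   clean_transcript("Um, the summer, uh, church fair")
#   -> "the summer, church fair"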
def process_transcript_nltk(transcript):
    # TextTiling splits the transcript into topically coherent segments
    # rather than individual sentences.
    tt = TextTilingTokenizer()
    segments = tt.tokenize(transcript)
    return segments
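
# TextTilingTokenizer expects paragraph breaks (blank lines) in its input and
# may raise ValueError on short or single-paragraph text, which raw
# transcripts often are. A minimal fallback sketch (this _safe variant is an
# assumption, not part of the original script):
def process_transcript_nltk_safe(transcript):
    try:
        return TextTilingTokenizer().tokenize(transcript)
    except ValueError:
        # Fall back to plain sentence tokenization.
        return sent_tokenize(transcript)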
def process_transcript_spacy(transcript):
    # spaCy's small English model; run `python -m spacy download en_core_web_sm`
    # once beforehand if it is not installed.
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(transcript)
    return [sent.text for sent in doc.sents]
def process_transcript_bert_huggingface(transcript):
    # Restore punctuation by masking the slot after each word and checking
    # whether BERT predicts a punctuation token there.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()
    words = transcript.split()
    punctuated = []
    # Break the transcript into chunks so each encoded sequence stays within
    # BERT's 512-token limit. 400 words (rather than the original 512) leaves
    # headroom for WordPiece splits and the [CLS]/[SEP] specials.
    chunk_size = 400
    chunks = [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]
    for chunk in chunks:
        for i in range(len(chunk)):
            # Insert [MASK] after word i instead of replacing word i, so the
            # model predicts what follows the word (e.g. punctuation).
            masked = chunk[:i + 1] + ['[MASK]'] + chunk[i + 1:]
            input_ids = tokenizer.encode(' '.join(masked), return_tensors='pt',
                                         truncation=True, max_length=512)
            # Locate [MASK] by its token id; assuming it sits at position i+1
            # breaks as soon as WordPiece splits an earlier word.
            mask_pos = (input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
            punctuated.append(chunk[i])
            if len(mask_pos) == 0:
                continue  # mask was truncated away
            with torch.no_grad():
                logits = model(input_ids).logits
            predicted_index = torch.argmax(logits[0, mask_pos[0]]).item()
            predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
            if predicted_token in ['.', ',', '?', '!']:
                punctuated.append(predicted_token)
    return ' '.join(punctuated)
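
# Note: this runs one full BERT forward pass per word, so it is very slow on
# long transcripts; batching the masked variants would be the first thing to
# try. Rough usage sketch (illustrative input):
#   process_transcript_bert_huggingface("hello how are you i am fine")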
def select_directory(subdirs):
    # Present an interactive list of transcript subdirectories.
    subdir_names = [subdir.name for subdir in subdirs]
    questions = [
        inquirer.List('chosen_dir',
                      message="Select a directory",
                      choices=subdir_names,
                      )
    ]
    answers = inquirer.prompt(questions)
    # Map the chosen name back to its Path object.
    selected_dir = next(subdir for subdir in subdirs if subdir.name == answers['chosen_dir'])
    return selected_dir
def select_segmenter():
    # Ask the user which segmentation backend to use.
    questions = [
        inquirer.List('chosen_segmenter',
                      message="Select a segmenter",
                      choices=['nltk', 'spacy', 'bert'],
                      )
    ]
    answers = inquirer.prompt(questions)
    return answers['chosen_segmenter']
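
# Expected directory layout (inferred from the lookups in app() below; the
# episode name is a placeholder, not documented elsewhere):
#   transcripts/
#       <episode_name>/
#           <episode_name>_transcript.txt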
def app():
    transcripts_path = Path('transcripts')
    # Get a list of all subdirectories in the transcripts folder.
    subdirs = [d for d in transcripts_path.iterdir() if d.is_dir()]
    # Ask the user to select a directory and a segmenter (nltk, spacy, or bert).
    selected_dir = select_directory(subdirs)
    selected_segmenter = select_segmenter()
    # There should be exactly one transcript ending in _transcript.txt.
    transcript_path = [f for f in selected_dir.iterdir() if f.name.endswith('_transcript.txt')][0]
    # Read the transcript.
    with transcript_path.open('r', encoding='utf-8') as f:
        transcript = f.read()
    # Clean, then segment with the chosen backend.
    cleaned_transcript = clean_transcript(transcript)
    if selected_segmenter == 'nltk':
        sentences = process_transcript_nltk(cleaned_transcript)
    elif selected_segmenter == 'spacy':
        sentences = process_transcript_spacy(cleaned_transcript)
    elif selected_segmenter == 'bert':
        sentences = process_transcript_bert_huggingface(cleaned_transcript)
    else:
        raise ValueError(f"Invalid segmenter: {selected_segmenter}")
    print(sentences)
if __name__ == '__main__':
    # TextTiling needs the stopwords corpus in addition to the Punkt models.
    nltk.download('punkt')
    nltk.download('stopwords')
    app()