-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathImage_to_text_pipeline.py
78 lines (62 loc) · 2.17 KB
/
Image_to_text_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import warnings
import logging
import cv2
import numpy as np
from gtts import gTTS
import sounddevice as sd
import soundfile as sf
import tempfile
from PIL import Image
import os
# Set the environment variable before transformers is imported further down.
# NOTE(review): presumably this switches off TensorFlow's oneDNN optimizations and
# has to be in place before TensorFlow is loaded - confirm against the TensorFlow docs.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# Ignore every Python warning raised from this point on.
warnings.simplefilter('ignore')
# Suppress all logging calls of severity WARNING and below, process-wide.
logging.disable(logging.WARNING)
# Imported here, after the settings above, rather than with the other imports.
from transformers import pipeline
# Module-level image-to-text pipeline, built once when this file is executed.
# No model name is passed.
# NOTE(review): presumably the library falls back to its default checkpoint - confirm.
caption = pipeline('image-to-text')
def T_T_speech(text, Language='en'):
    """Convert *text* to speech and play it, blocking until playback ends.

    The speech is synthesized with gTTS, written to a temporary file,
    decoded with soundfile and played through sounddevice. The temporary
    file is always removed afterwards.

    Args:
        text: The sentence to speak.
        Language: Language code handed to gTTS (default ``'en'``).

    Returns:
        None. Any failure (synthesis, decoding, playback) is reported on
        stdout instead of being raised, so callers are never interrupted.
    """
    audio_path = None
    try:
        # Generate speech
        speech = gTTS(text=text, lang=Language, slow=False)

        # gTTS produces MP3 data, so label the temporary file accordingly.
        # NOTE(review): decoding MP3 through soundfile presumably needs a
        # libsndfile build with MP3 support - confirm on the target machine.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as fp:
            audio_path = fp.name
            speech.write_to_fp(fp)
        # Leaving the ``with`` closes the handle, so the data is flushed to
        # disk before the file is reopened by name below.

        # Read the audio file and play it
        with sf.SoundFile(audio_path, 'r') as sound_file:
            data = sound_file.read(dtype='int16')
            samplerate = sound_file.samplerate
        sd.play(data, samplerate)
        sd.wait()
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # delete=False means nobody else cleans this file up; without this
        # step one audio file would be left behind per spoken caption.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not turn a
                # successful playback into a failure.
                pass
def is_scene_change(current_frame, previous_frame, threshold=1000):
    """Tell whether *current_frame* differs noticeably from *previous_frame*.

    Args:
        current_frame: The newest frame, as a NumPy array.
        previous_frame: The frame to compare against, or ``None`` when no
            baseline frame exists yet.
        threshold: Number of differing array elements that must be exceeded
            for the frames to count as different (default 1000).

    Returns:
        True when there is no baseline frame, when the two frames do not
        have the same shape, or when more than *threshold* elements of the
        absolute difference are non-zero; False otherwise.
    """
    # Without a baseline, or with a changed frame size, cv2.absdiff would
    # raise; both situations mean the scene has to be treated as new.
    if previous_frame is None or current_frame.shape != previous_frame.shape:
        return True

    # Calculate difference
    difference = cv2.absdiff(current_frame, previous_frame)
    changed_elements = np.count_nonzero(difference)
    return changed_elements > threshold
# Main loop: watch the default webcam, caption every frame that counts as a
# scene change, print and speak the caption, and show the live feed until
# 'q' is pressed or a frame can no longer be read.
cap = cv2.VideoCapture(0)
try:
    # Grab the baseline frame. If even this first read fails there is
    # nothing to compare against, so report it and skip the loop entirely.
    ret, previous_frame = cap.read()
    if not ret:
        print("Failed to capture image")

    while ret:
        ret, current_frame = cap.read()
        if not ret:
            print("Failed to capture image")
            break

        # Check for scene change
        if is_scene_change(current_frame, previous_frame):
            # Convert OpenCV frame (BGR) to PIL Image (RGB)
            frame_rgb = cv2.cvtColor(current_frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)

            # Process the frame for captioning
            try:
                captions = caption(pil_image)
                # Convert captions to string if necessary
                captions_text = str(captions[0]['generated_text'])
                print(captions_text)
                T_T_speech(captions_text)
            except Exception as e:
                # A failed caption must not stop the capture loop.
                print(f"Error in captioning: {e}")

        # NOTE(review): the baseline is refreshed on every iteration
        # (frame-to-frame comparison). The flattened source did not show
        # whether this line belonged inside the scene-change branch - confirm.
        previous_frame = current_frame.copy()
        cv2.imshow('Webcam Feed', current_frame)

        # waitKey also gives the window time to redraw; 'q' quits.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even when the loop is left
    # through an exception or Ctrl+C.
    cap.release()
    cv2.destroyAllWindows()