Commit

CursedPrograms committed Feb 13, 2024
1 parent 814bf65 commit 5cb8939
Showing 2 changed files with 87 additions and 67 deletions.
8 changes: 4 additions & 4 deletions scripts/play_audio.py
@@ -14,7 +14,7 @@ def play_audio(audio_path):
         pygame.mixer.music.stop()
         play_count += 1

-    try:
-        os.remove(audio_path)
-    except Exception as e:
-        print(f"Error while removing file: {e}")
+    # try:
+    #     os.remove(audio_path)
+    # except Exception as e:
+    #     print(f"Error while removing file: {e}")
146 changes: 83 additions & 63 deletions scripts/robot.py
@@ -18,6 +18,10 @@
 from gpt.system.generate_text import generate_text
 from gpt.system.clean_text import clean_text
 import sys
+import webbrowser
+from PIL import Image
+import glob
+import os

 transformers.logging.set_verbosity_error()
 tf.get_logger().setLevel(logging.ERROR)
@@ -44,45 +48,46 @@ def download_caffe_model_files():
         download_file(caffemodel_url, caffemodel_path)

 def extract_gender_and_age(frame, face_cascade):
+    global subject_age, subject_gender
+    subject_age = None
+    subject_gender = None
+
     try:
         # Convert the frame to RGB (required by DeepFace)
         rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

         # Use Haar Cascade to detect faces
         gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
         faces = face_cascade.detectMultiScale(gray_frame, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

         for (x, y, w, h) in faces:
             # Extract face region
             face_roi = frame[y:y+h, x:x+w]

             # Use DeepFace to analyze gender and age
-            result = DeepFace.analyze(face_roi, actions=['gender', 'age'])
-
-            gender_stats = result[0]['gender']
-            age = int(result[0]['age'])
-            print("Predicted Gender:", gender_stats)
-            print("Predicted Age:", age)
-
-            # Select the dominant gender label
-            gender = max(gender_stats, key=gender_stats.get)
-
-            # Display gender and age information on the frame
-            cv2.putText(frame, f"Gender: {gender} - {gender_stats[gender]:.2f}%", (x, y - 10),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
-            cv2.putText(frame, f"Age: {age} years", (x, y + h + 20),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
-
-            # Draw rectangle around the face
-            cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)
+            result = DeepFace.analyze(face_roi, actions=['gender', 'age'], enforce_detection=False)
+
+            if 'gender' in result[0] and 'age' in result[0]:
+                gender_stats = result[0]['gender']
+                age = int(result[0]['age'])
+                print("Predicted Gender:", gender_stats)
+                print(f"Predicted Gender: {max(gender_stats, key=gender_stats.get)}")
+                print("Predicted Age:", age)
+                text_to_speech(f"Predicted Age: {age}")
+                text_to_speech(f"Predicted Gender: {gender_stats}")
+                text_to_speech(f"You are a {max(gender_stats, key=gender_stats.get)}")
+
+                subject_age = age
+                subject_gender = f"You are a {max(gender_stats, key=gender_stats.get)}"
+                gender = max(gender_stats, key=gender_stats.get)
+
+                cv2.putText(frame, f"Gender: {gender} - {gender_stats[gender]:.2f}%", (x, y - 10),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
+                cv2.putText(frame, f"Age: {age} years", (x, y + h + 20),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
+                cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)

         return frame

     except Exception as e:
         print(f"Error: {e}")
         return frame
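For context on the `enforce_detection=False` change: `DeepFace.analyze` raises an exception when it cannot find a face in its input, and passing `enforce_detection=False` makes it analyze the crop regardless, which suits a region already located by the Haar cascade. A minimal standalone sketch of the call and the result shape the code above relies on (the input filename is hypothetical; the `gender` entry is a dict of class percentages):

```python
import cv2
from deepface import DeepFace

face_roi = cv2.imread("face.jpg")  # hypothetical pre-cropped face image
result = DeepFace.analyze(face_roi, actions=['gender', 'age'], enforce_detection=False)
# result is a list with one dict per detected face, e.g.:
#   result[0]['age']    -> 33
#   result[0]['gender'] -> {'Woman': 96.4, 'Man': 3.6}
gender = max(result[0]['gender'], key=result[0]['gender'].get)  # dominant label
```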

-def capture_audio(duration=30, sample_rate=44100):
+def capture_audio(duration=15, sample_rate=44100):
     p = pyaudio.PyAudio()
     stream = p.open(format=pyaudio.paInt16,
                     channels=1,
@@ -92,19 +97,18 @@ def capture_audio(duration=30, sample_rate=44100):

     frames = []
     print("Recording audio...")
-
+    text_to_speech("I am listening...")
     for i in range(0, int(sample_rate / 1024 * duration)):
         data = stream.read(1024)
         frames.append(data)

     print("Finished recording.")
+    text_to_speech("I am thinking...")
     stream.stop_stream()
     stream.close()
     p.terminate()

     audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
-
-    # Save audio data to a temporary file
     temp_audio_file = "output/audio/temp_audio.wav"
     os.makedirs(os.path.dirname(temp_audio_file), exist_ok=True)
     wf = wave.open(temp_audio_file, 'wb')
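The hunk is truncated here, so the rest of the WAV-writing code is hidden. For reference, a minimal sketch of how such a temp file is typically finalized with the `wave` module; parameter values mirror the recording settings above, and the helper name is hypothetical:

```python
import wave
import pyaudio

def save_temp_wav(frames, path, sample_rate=44100):
    # Write the recorded frames out as a 16-bit mono WAV file.
    p = pyaudio.PyAudio()
    wf = wave.open(path, 'wb')
    wf.setnchannels(1)                                   # mono, matching the stream above
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))  # 2 bytes per 16-bit sample
    wf.setframerate(sample_rate)
    wf.writeframes(b''.join(frames))
    wf.close()
    p.terminate()
```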
@@ -118,38 +122,24 @@ def capture_audio(duration=30, sample_rate=44100):

 def text_to_speech(text):
     cleaned_text = clean_text(text)
-
-    # Generate a unique filename with a timestamp
     timestamp = int(time.time())
     audio_path = f"output/audio/output_{timestamp}.mp3"
-
     os.makedirs(os.path.dirname(audio_path), exist_ok=True)
-
     tts = gTTS(text=cleaned_text, lang='en', slow=False)
     tts.save(audio_path)
     play_audio(audio_path)

 def capture_photo(output_dir="output/shot"):
-    # Create the output directory if it doesn't exist
     os.makedirs(output_dir, exist_ok=True)
-
-    # Open a connection to the webcam
     cap = cv2.VideoCapture(0)
-
-    # Capture a single frame
     ret, frame = cap.read()
-
-    # Release the webcam
     cap.release()
-
-    # Save the captured frame to the specified directory
     image_path = os.path.join(output_dir, "captured_photo.jpg")
     cv2.imwrite(image_path, frame)

     return image_path

 def image_description(image_path):
-    # Load the pre-trained MobileNet SSD model and its class labels
     net = cv2.dnn.readNetFromCaffe("models/deploy.prototxt", "models/mobilenet_iter_73000.caffemodel")
     classes = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair",
                "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
@@ -159,8 +149,6 @@ def image_description(image_path):
     image = cv2.imread(image_path)
     h, w = image.shape[:2]
     blob = cv2.dnn.blobFromImage(image, 0.007843, (300, 300), 127.5)
-
-    # Pass the blob through the network and obtain the detections
     net.setInput(blob)
     detections = net.forward()
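For readers unfamiliar with the OpenCV DNN output consumed below: `blobFromImage(image, 0.007843, (300, 300), 127.5)` subtracts a mean of 127.5 and scales by 0.007843 (roughly 1/127.5), normalizing pixels to about [-1, 1] as MobileNet SSD expects, and `net.forward()` returns a tensor of shape (1, 1, N, 7) where each row holds `[image_id, class_id, confidence, x1, y1, x2, y2]` with box coordinates normalized to [0, 1]. A sketch of unpacking it, assuming `detections`, `w`, and `h` from the code above:

```python
import numpy as np

for i in range(detections.shape[2]):
    confidence = detections[0, 0, i, 2]
    if confidence > 0.5:
        class_id = int(detections[0, 0, i, 1])
        # Scale the normalized box corners back to pixel coordinates.
        x1, y1, x2, y2 = (detections[0, 0, i, 3:7] * np.array([w, h, w, h])).astype(int)
```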

@@ -169,18 +157,19 @@ def image_description(image_path):
         confidence = detections[0, 0, i, 2]
         if confidence > 0.5:  # Confidence threshold
             class_id = int(detections[0, 0, i, 1])
-            description = f"A photo containing a {classes[class_id]} with confidence {confidence:.2f}"
+            print(f"A photo containing a {classes[class_id]} with {confidence:.2f}")
+            description = f"I am looking at a {classes[class_id]}"
+            text_to_speech(f"A photo containing a {classes[class_id]} with {confidence:.2f}")
             return description

     return "Unable to identify the content of the photo."

-def text_to_image(prompt):
+def text_to_image(prompt, output_directory='output/generated_images'):
     print("Using SDXL-Turbo for Text-to-image:")
     print("Make sure to install the required packages using:")
     print("pip install diffusers transformers accelerate --upgrade")
     print()

-    output_directory = 'output/generated_images'
     os.makedirs(output_directory, exist_ok=True)

     # Use a counter to create a unique filename
@@ -198,15 +187,21 @@ def text_to_image(prompt):
 pipe.to("cpu")
 prompt = "{prompt}"
 image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
-image.save(os.path.join(output_directory, '{output_filename}'))
+image.save(os.path.join('{output_directory}', '{output_filename}'))
 """

     print("Generated code snippet:")
     print(text_to_image_code)

     exec(text_to_image_code)
+    text_to_speech("I drew this:")
+    return Image.open(os.path.join(output_directory, output_filename))

 def main():
+
+    output_directory = 'output/generated_images'
+    os.makedirs(output_directory, exist_ok=True)
+
     # Load GPT-2 model for text generation
     model_name = "gpt2"
     gpt_model = TFAutoModelForCausalLM.from_pretrained(model_name)
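The `exec`-based snippet above rebuilds the generation code as a string and runs it on every call; the same result can be had by calling diffusers directly, which avoids string interpolation into executable code. A sketch under the assumption that the hidden part of the template loads the `stabilityai/sdxl-turbo` checkpoint (the function name is hypothetical):

```python
import os
from diffusers import AutoPipelineForText2Image

def generate_image(prompt, output_directory, output_filename):
    # SDXL-Turbo is distilled for single-step generation, hence
    # num_inference_steps=1 and guidance_scale=0.0.
    pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo")
    pipe.to("cpu")
    image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
    image.save(os.path.join(output_directory, output_filename))
    return image
```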
@@ -224,13 +219,38 @@ def main():
     # Speech recognizer
     recognizer = sr.Recognizer()

+    image_displayed = False
+
     while True:
+
+        if not image_displayed:
+
+            mp3_files = glob.glob("output/audio/*.mp3")
+            for mp3_file in mp3_files:
+                os.remove(mp3_file)
+
+            text_to_speech("Hello")
+
+            initial_prompt = "A Robotic Lady"
+            initial_generated_image = text_to_image(initial_prompt, output_directory)
+
+            # Save the JpegImageFile object to a file
+            initial_generated_image_path = os.path.join(output_directory, "initial_generated_image.jpg")
+            initial_generated_image.save(initial_generated_image_path)
+
+            # Open the saved image using the default image viewer
+            webbrowser.open(initial_generated_image_path)
+
+            # Set the flag to True to indicate that the image has been displayed
+            image_displayed = True
+
         # Capture video from webcam
         ret, frame = cap.read()

         # Check if frame is valid
         if not ret:
             print("Error: Couldn't capture frame from the webcam.")
+            text_to_speech("Error: Couldn't capture frame from the webcam.")
             break

         # Apply face detection and gender/age analysis to the frame
@@ -250,39 +270,39 @@ def main():
             audio = recognizer.record(source)
             audio_text = recognizer.recognize_google(audio)
             print(f"Audio Text: {audio_text}")
+            text_to_speech(f"Audio Text: {audio_text}")
         except sr.UnknownValueError:
             print("Could not understand audio.")
+            text_to_speech("Could not understand audio.")
         except sr.RequestError as e:
             print(f"Error with the speech recognition service; {e}")

         # Construct prompt based on gender, age, audio, and image description
-        gender_prompt = "man" if "man" in frame else "woman" if "woman" in frame else "person"
-        age_prompt = "33"  # Replace with the detected age
+        gender_prompt = subject_gender
+        age_prompt = str(subject_age) if subject_age else ""
         image_desc = image_description(captured_image_path)

         prompt = f"A {gender_prompt} {age_prompt}, {audio_text}, {image_desc}"

         # Generate text based on the prompt
         generated_text = generate_text(prompt, gpt_model, tokenizer)

         # Print and say the generated text
         if generated_text:
             print("Generated Text:")
             print(generated_text)
             text_to_speech(generated_text)

         # Draw an image based on the prompt
         if prompt:
-            text_to_image(prompt)
-
-        # Break the loop when 'q' key is pressed
-        if cv2.waitKey(1) & 0xFF == ord('q'):
-            break
-
-    # Release the webcam and close all windows
-    cap.release()
+            generated_image = text_to_image(prompt, output_directory)
+            text_to_speech(f"I am drawing a: {prompt}")
+
+            generated_image_path = os.path.join(output_directory, "initial_generated_image.jpg")
+            generated_image.save(generated_image_path)
+
+            webbrowser.open(generated_image_path)
+
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break

     cv2.destroyAllWindows()

 if __name__ == "__main__":
     sys.stdout = sys.__stdout__
     main()
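One note on the loop's teardown: in the rewritten tail above, `cap.release()` appears only among the removed lines, so the webcam is no longer released before `cv2.destroyAllWindows()`. A minimal sketch of the usual exit sequence, should it need restoring:

```python
import cv2

cap = cv2.VideoCapture(0)
# ... main loop ...
cap.release()            # release the webcam first
cv2.destroyAllWindows()  # then close any OpenCV windows
```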
