KamilZPWr · pwrw8 · Nov 14, 2020 · Nov 14, 2020 · Nov 15, 2020 · Nov 15, 2020
diff --git a/second-task/models/speech_to_text.py b/second-task/models/speech_to_text.py
@@ -1,18 +1,21 @@
-import json
 from typing import List, Tuple
 
+import pandas as pd
 import speech_recognition as sr
-from pydub import AudioSegment  #
+from pydub import AudioSegment
+
+from models.labels_matcher import get_similar_label, EXPECTED_LABELS
 
 
 def divide_chunks(sound, chunk_size):
     for i in range(0, len(sound), chunk_size):
         yield sound[i:i + chunk_size]
 
 
-def process(file_path: str, chunk_size: int = 60000, apikey=None) -> List[Tuple[int, str]]:
+def process(file_path: str, chunk_size: int = 60000, apikey=None) -> Tuple[List[Tuple[int, str]], int]:
 
     sound = AudioSegment.from_mp3(file_path)
+    seconds = sound.duration_seconds
     chunks = list(divide_chunks(sound, chunk_size))
 
     r = sr.Recognizer()
@@ -24,6 +27,7 @@ def process(file_path: str, chunk_size: int = 60000, apikey=None) -> List[Tuple[
 
         with sr.AudioFile(wav_file_path) as source:
             audio = r.record(source)
+
         if apikey:
             s = r.recognize_google_cloud(audio, language="pl-PL", credentials_json=apikey)
             words_per_second = len(s.split()) / 60
@@ -35,6 +39,36 @@ def process(file_path: str, chunk_size: int = 60000, apikey=None) -> List[Tuple[
             s = r.recognize_google(audio, language="pl-PL")
             results.append((index*5, s))
 
+    return results, seconds
+
+
+def assign_label(chunks: List[Tuple[int, str]], max_second) -> pd.DataFrame:
+    """Build a per-second table flagging which labels each chunk matched.
+
+    Args:
+        chunks: Pairs of ``(second, text)``; ``text`` is passed to
+            ``get_similar_label`` and ``second`` selects the row to flag.
+        max_second: Exclusive upper bound of the ``second`` column, after
+            truncation with ``int``.
+
+    Returns:
+        A DataFrame with a ``second`` column holding 1 .. int(max_second) - 1
+        and the ``EXPECTED_LABELS`` column(s), initialised to 0 and set to 1
+        in the row whose ``second`` equals a chunk's first element.
+    """
+    seconds = list(range(1, int(max_second)))
+    results = pd.DataFrame(data={'second': seconds})
+    results[EXPECTED_LABELS] = 0
+
+    # NOTE(review): rows start at second 1, so a chunk stamped 0 (or any
+    # second outside the range) matches no row and its labels are dropped
+    # silently - confirm this is intended.
+    for chunk in chunks:
+        # presumably a list of EXPECTED_LABELS column names; verify in labels_matcher
+        matched = get_similar_label(chunk[1])
+        row_mask = results['second'] == chunk[0]
+        results.loc[row_mask, matched] = 1
+
     return results
 
 

diff --git a/second-task/models/translator.py b/second-task/models/translator.py
@@ -0,0 +1,52 @@
+from typing import List, Tuple
+
+from google.cloud import translate_v2
+from google.oauth2 import service_account
+
+
+def translate(text, credentials, client=None):
+    """Translate ``text`` from Polish to English.
+
+    Args:
+        text: Source text to translate.
+        credentials: Credentials used to build a ``translate_v2.Client`` when
+            ``client`` is not given.
+        client: Optional existing ``translate_v2.Client`` to reuse, so that
+            translating many texts does not build a new client per call.
+
+    Returns:
+        The ``translatedText`` field of the API response.
+    """
+    if client is None:
+        client = translate_v2.Client(credentials=credentials)
+    # Request plain-text handling: the API's default format is HTML, which
+    # returns HTML-escaped entities (e.g. ``&#39;``) in the translated text.
+    result = client.translate(
+        text, target_language='en', source_language='pl', format_='text')
+    return result["translatedText"]
+
+
+def translate_list_of_chunks(chunks: List[Tuple[int, str]], apikey_path: str) -> List[Tuple[int, str]]:
+    """Translate the text of every chunk from Polish to English.
+
+    Args:
+        chunks: Pairs whose second element is the text to translate; the
+            first element is carried over unchanged.
+        apikey_path: Path to a service-account key file.
+
+    Returns:
+        A new list of ``(chunk[0], translated_text)`` pairs in input order.
+    """
+    credentials = service_account.Credentials.from_service_account_file(apikey_path)
+    # One client is shared by all chunks instead of creating one per chunk.
+    client = translate_v2.Client(credentials=credentials)
+    return [
+        (chunk[0], translate(chunk[1], credentials, client=client))
+        for chunk in chunks
+    ]
+
+
+if __name__ == '__main__':
+    # Manual smoke test; needs a valid service-account key in apikey.json.
+    chunks_list = [(0, "Wszedł kotek na płotek. Lubię placki."), (1, "inny język"), (2, 'ostatnia partia')]
+    print(translate_list_of_chunks(chunks_list, 'apikey.json'))
diff --git a/second-task/requirements.txt b/second-task/requirements.txt
@@ -28,4 +28,5 @@ moviepy==1.0.3
 opencv-python==4.4.0.46
 pandas==1.1.4
 grpcio
-grpcio-tools
+grpcio-tools
+google-cloud-translate==3.0.1
diff --git a/second-task/service/celery_tasks/celery_task.py b/second-task/service/celery_tasks/celery_task.py
@@ -1,33 +1,49 @@
-import json
-import os
-
+import logging
+
 from google.oauth2 import service_account
 
 from models import speech_to_text, video_to_labels
+from models.speech_to_text import assign_label
+from models.translator import translate_list_of_chunks
 from service.api import celery
 from service.storage_manager import Storage, update_status, JobStatus
 
+logger = logging.getLogger(__name__)
+
 
 @celery.task
 def process_speech_to_text(job_id: str):
+    """Transcribe a job's MP3, translate the chunks and store their labels.
+
+    The result of ``assign_label`` is written as CSV to the job's results
+    path, after which the job is marked as finished.
+    """
     mp3_path = Storage.get_input_mp3_path(job_id).replace('../', './')
-    results = speech_to_text.process(mp3_path)
-    with open(mp3_path.replace('.mp3', '.json'), 'w+') as file:
-        json.dump(results, file)
-    # TODO: call pdf creation
+    # ``process`` returns the recognised chunks plus the audio duration.
+    transcript_chunks, seconds = speech_to_text.process(mp3_path)
+    translated_chunks = translate_list_of_chunks(transcript_chunks, Storage.get_apikey_path())
+    results = assign_label(translated_chunks, seconds)
+    # NOTE(review): process_video_to_labels writes to the same results path;
+    # confirm both tasks are never run for one job, or one CSV overwrites
+    # the other.
+    results_path = Storage.get_results_path(job_id)
+    logger.debug("Speech labels for job %s -> %s\n%s", job_id, results_path, results.head())
+    results.to_csv(results_path, index=False)
     update_status(job_id, JobStatus.finished)
 
 
 @celery.task
 def process_video_to_labels(job_id: str):
-
-    import os
-    print(os.path.dirname(os.path.realpath(__file__)))
-
+    """Extract labels from a job's MP4 and store them as CSV.
+
+    The labels returned by ``video_to_labels.process`` are written to the
+    job's results path, after which the job is marked as finished.
+    """
     mp4_path = Storage.get_input_mp4_path(job_id).replace('../', './')
     credentials_path = Storage.get_apikey_path()
     credentials = service_account.Credentials.from_service_account_file(credentials_path)
     labels = video_to_labels.process(mp4_path, credentials)
-    results_path = os.path.join(Storage.get_job_root(job_id).replace('../', './'), 'results.csv')
+    results_path = Storage.get_results_path(job_id)
+    logger.debug("First rows of video labels for job %s:\n%s", job_id, labels.head())
     labels.to_csv(results_path, index=False)
     update_status(job_id, JobStatus.finished)
diff --git a/second-task/service/storage_manager.py b/second-task/service/storage_manager.py
@@ -27,7 +27,9 @@ def get_job_root(job_id):
 
     @staticmethod
     def get_results_path(job_id):
-        return os.path.join('..', Storage.get_job_root(job_id), 'results.pdf')
+        """Return ``results.csv`` joined onto ``Storage.get_job_root(job_id)``."""
+        job_root = Storage.get_job_root(job_id)
+        return os.path.join(job_root, 'results.csv')
 
     @staticmethod
     def get_input_mp3_path(job_id):