From b01d70a9a9b1b04994775b1063902faeef521908 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Cupek?= <cupek@icloud.com>
Date: Sat, 14 Nov 2020 17:58:19 +0100
Subject: [PATCH 1/3] Added translator module for better audio transcription
 classification.

---
 second-task/models/translator.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 second-task/models/translator.py

diff --git a/second-task/models/translator.py b/second-task/models/translator.py
new file mode 100644
index 0000000..029de91
--- /dev/null
+++ b/second-task/models/translator.py
@@ -0,0 +1,26 @@
+from google.cloud import translate_v2
+from google.oauth2 import service_account
+
+
+def translate(text):
+    my_credentials = service_account.Credentials.from_service_account_file('apikey.json')
+    translate_client = translate_v2.Client(credentials=my_credentials)
+    result = translate_client.translate(text, target_language='en', source_language='pl')
+
+    # print(u"Text: {}".format(result["input"]))
+    # print(u"Translation: {}".format(result["translatedText"]))
+    return result["translatedText"]
+
+
+def translate_list_of_chunks(chunks: list) -> list:
+    ''' Accepts list of tuples like [(int, string), ...]'''
+    result_list = []
+    for chunk in chunks:
+        translated = translate(chunk[1])
+        result_list.append((chunk[0], translated))
+    return result_list
+
+
+if __name__ == '__main__':
+    chunks_list = [(0, "Wszedł kotek na płotek. Lubię placki."), (1, "inny język"), (2, 'ostatnia partia')]
+    translate_list_of_chunks(chunks_list)

From 845d25dd6f4a9bffa2f4b80ae10b39cdbf61f3d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Cupek?= <cupek@icloud.com>
Date: Sat, 14 Nov 2020 17:58:50 +0100
Subject: [PATCH 2/3] fixed docstring

---
 second-task/models/translator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/second-task/models/translator.py b/second-task/models/translator.py
index 029de91..367f94a 100644
--- a/second-task/models/translator.py
+++ b/second-task/models/translator.py
@@ -13,7 +13,7 @@ def translate(text):
 
 
 def translate_list_of_chunks(chunks: list) -> list:
-    ''' Accepts list of tuples like [(int, string), ...]'''
+    """ Accepts list of tuples like [(int, string), ...]"""
     result_list = []
     for chunk in chunks:
         translated = translate(chunk[1])

From 3924ad5150d4da6e76f97a2c5f360d28167bfb5a Mon Sep 17 00:00:00 2001
From: Kamil Zawistowski <kamil.zawistowski@gds.ey.com>
Date: Sun, 15 Nov 2020 04:56:31 +0100
Subject: [PATCH 3/3] integrate chunk audio with labels assignment and webapp

---
 second-task/models/speech_to_text.py          | 28 ++++++++++++++++---
 second-task/models/translator.py              | 19 ++++++-------
 second-task/requirements.txt                  |  3 +-
 .../service/celery_tasks/celery_task.py       | 23 ++++++++-------
 second-task/service/storage_manager.py        |  2 +-
 5 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/second-task/models/speech_to_text.py b/second-task/models/speech_to_text.py
index c02eacb..f7bb6f6 100644
--- a/second-task/models/speech_to_text.py
+++ b/second-task/models/speech_to_text.py
@@ -1,8 +1,10 @@
-import json
 from typing import List, Tuple
 
+import pandas as pd
 import speech_recognition as sr
-from pydub import AudioSegment  #
+from pydub import AudioSegment
+
+from models.labels_matcher import get_similar_label, EXPECTED_LABELS
 
 
 def divide_chunks(sound, chunk_size):
@@ -10,9 +12,10 @@ def divide_chunks(sound, chunk_size):
         yield sound[i:i + chunk_size]
 
 
-def process(file_path: str, chunk_size: int = 5000, apikey=None) -> List[Tuple[int, str]]:
+def process(file_path: str, chunk_size: int = 60000, apikey=None) -> Tuple[List[Tuple[int, str]], int]:
 
     sound = AudioSegment.from_mp3(file_path)
+    seconds = sound.duration_seconds
     chunks = list(divide_chunks(sound, chunk_size))
 
     r = sr.Recognizer()
@@ -24,13 +27,30 @@ def process(file_path: str, chunk_size: int = 5000, apikey=None) -> List[Tuple[i
 
         with sr.AudioFile(wav_file_path) as source:
             audio = r.record(source)
+
         if apikey:
             s = r.recognize_google_cloud(audio, language="pl-PL", credentials_json=apikey)
-            results.append((index*5, s))
+            words_per_second = len(s.split()) / 60
+            word_counter = 1
+            for word in s.split():
+                results.append((int((index * 60) + word_counter / words_per_second), word))
+                word_counter += 1
+            print(index)
         else:
             s = r.recognize_google(audio, language="pl-PL")
             results.append((index*5, s))
 
+    return results, seconds
+
+
+def assign_label(chunks: List[Tuple[int, str]], max_second) -> pd.DataFrame:
+    results = pd.DataFrame(data={'second': list(range(1, int(max_second)))})
+    results[EXPECTED_LABELS] = 0
+
+    for chunk in chunks:
+        labels = get_similar_label(chunk[1])
+        results.loc[results['second'] == chunk[0], labels] = 1
+
     return results
 
 
diff --git a/second-task/models/translator.py b/second-task/models/translator.py
index 367f94a..1bdaeb9 100644
--- a/second-task/models/translator.py
+++ b/second-task/models/translator.py
@@ -1,26 +1,25 @@
+from typing import List, Tuple
+
 from google.cloud import translate_v2
 from google.oauth2 import service_account
 
 
-def translate(text):
-    my_credentials = service_account.Credentials.from_service_account_file('apikey.json')
-    translate_client = translate_v2.Client(credentials=my_credentials)
+def translate(text, credentials):
+    translate_client = translate_v2.Client(credentials=credentials)
     result = translate_client.translate(text, target_language='en', source_language='pl')
-
-    # print(u"Text: {}".format(result["input"]))
-    # print(u"Translation: {}".format(result["translatedText"]))
     return result["translatedText"]
 
 
-def translate_list_of_chunks(chunks: list) -> list:
-    """ Accepts list of tuples like [(int, string), ...]"""
+def translate_list_of_chunks(chunks: List[Tuple[int, str]], apikey_path: str) -> List[Tuple[int, str]]:
     result_list = []
+    credentials = service_account.Credentials.from_service_account_file(apikey_path)
+
     for chunk in chunks:
-        translated = translate(chunk[1])
+        translated = translate(chunk[1], credentials)
         result_list.append((chunk[0], translated))
     return result_list
 
 
 if __name__ == '__main__':
     chunks_list = [(0, "Wszedł kotek na płotek. Lubię placki."), (1, "inny język"), (2, 'ostatnia partia')]
-    translate_list_of_chunks(chunks_list)
+    translate_list_of_chunks(chunks_list, 'apikey.json')
diff --git a/second-task/requirements.txt b/second-task/requirements.txt
index 081ea28..00c7115 100644
--- a/second-task/requirements.txt
+++ b/second-task/requirements.txt
@@ -28,4 +28,5 @@ moviepy==1.0.3
 opencv-python==4.4.0.46
 pandas==1.1.4
 grpcio
-grpcio-tools
\ No newline at end of file
+grpcio-tools
+google-cloud-translate==3.0.1
\ No newline at end of file
diff --git a/second-task/service/celery_tasks/celery_task.py b/second-task/service/celery_tasks/celery_task.py
index be73608..44fda70 100644
--- a/second-task/service/celery_tasks/celery_task.py
+++ b/second-task/service/celery_tasks/celery_task.py
@@ -1,9 +1,8 @@
-import json
-import os
-
 from google.oauth2 import service_account
 
 from models import speech_to_text, video_to_labels
+from models.speech_to_text import assign_label
+from models.translator import translate_list_of_chunks
 from service.api import celery
 from service.storage_manager import Storage, update_status, JobStatus
 
@@ -11,23 +10,23 @@
 @celery.task
 def process_speech_to_text(job_id: str):
     mp3_path = Storage.get_input_mp3_path(job_id).replace('../', './')
-    results = speech_to_text.process(mp3_path)
-    with open(mp3_path.replace('.mp3', '.json'), 'w+') as file:
-        json.dump(results, file)
-    # TODO: call pdf creation
+    labels, seconds = speech_to_text.process(mp3_path)
+    translated_labels = translate_list_of_chunks(labels, Storage.get_apikey_path())
+    results = assign_label(translated_labels, seconds)
+    results_path = Storage.get_results_path(job_id)
+    print(results_path)
+    print(results.head())
+    results.to_csv(results_path, index=False)
     update_status(job_id, JobStatus.finished)
 
 
 @celery.task
 def process_video_to_labels(job_id: str):
-
-    import os
-    print(os.path.dirname(os.path.realpath(__file__)))
-
     mp4_path = Storage.get_input_mp4_path(job_id).replace('../', './')
     credentials_path = Storage.get_apikey_path()
     credentials = service_account.Credentials.from_service_account_file(credentials_path)
     labels = video_to_labels.process(mp4_path, credentials)
-    results_path = os.path.join(Storage.get_job_root(job_id).replace('../', './'), 'results.csv')
+    results_path = Storage.get_results_path(job_id)
+    print(labels.head())
     labels.to_csv(results_path, index=False)
     update_status(job_id, JobStatus.finished)
diff --git a/second-task/service/storage_manager.py b/second-task/service/storage_manager.py
index 4878281..f8c6434 100644
--- a/second-task/service/storage_manager.py
+++ b/second-task/service/storage_manager.py
@@ -27,7 +27,7 @@ def get_job_root(job_id):
 
     @staticmethod
     def get_results_path(job_id):
-        return os.path.join('..', Storage.get_job_root(job_id), 'results.pdf')
+        return os.path.join(Storage.get_job_root(job_id), 'results.csv')
 
     @staticmethod
     def get_input_mp3_path(job_id):