"""Extract baby-name rankings from HTML listings into a CSV table.

Each input file is expected to contain a "Popularity in YYYY" heading and
table rows holding a rank followed by two names.  For every file the script
collects "<name> <rank>" entries, writes one column per file (headed by the
year) to ``DataFrame_Babynames.csv`` and, with ``--summaryfile``, also writes
``<input>.summary.txt`` next to each input file.

Usage: [--summaryfile] file [file ...]
"""
# Provenance: recovered from git patch 8d1c59e8fa441f00c274d85d773910cd2c924ce4
# ("Homework 2", Samuel Vasco Gonzalez, Sat, 9 Jul 2022), which creates
# EXTRAS/homeworks_to_submit/1152223665/homework_01/babynames.py.

import codecs  # opens the HTML input with an explicit encoding
import re
import sys

import numpy as np  # noqa: F401  (import kept from the original module)
import pandas as pd

# Heading that carries the year of the listing, e.g. "Popularity in 1990".
_YEAR_RE = re.compile(r"Popularity in (\d{4})")

# One table row: rank, then two names, each in its own <td> cell.  Anchoring
# on the cell tags keeps unrelated digits in the page (attribute values,
# years in running text, ...) from being picked up as ranks.
# NOTE(review): assumes rows look like
# <td>1</td><td>Michael</td><td>Jessica</td> -- confirm against the inputs.
_ROW_RE = re.compile(r"<td>(\d+)</td><td>(\w+)</td><td>(\w+)</td>")

_USAGE = 'usage: [--summaryfile] file [file ...]'
_CSV_NAME = "DataFrame_Babynames.csv"


def extract_names(filename):
    """Return the year and the ranked names found in one HTML file.

    Args:
        filename: Path of a UTF-8 encoded HTML file.

    Returns:
        A list of strings whose first element is the four-digit year and
        whose remaining elements are "<name> <rank>" entries sorted
        alphabetically.  Both names of every row are included, so a name
        that occurs in both name columns appears twice.

    Raises:
        ValueError: If the file has no "Popularity in YYYY" heading.
    """
    # The context manager closes the handle even if decoding fails.
    with codecs.open(filename, "r", "utf-8") as handle:
        text = handle.read()

    year_match = _YEAR_RE.search(text)
    if year_match is None:
        raise ValueError(
            "no 'Popularity in YYYY' heading found in {!r}".format(filename)
        )

    entries = []
    for rank, first_name, second_name in _ROW_RE.findall(text):
        entries.append(first_name + " " + rank)
        entries.append(second_name + " " + rank)
    entries.sort()

    # Put the year first explicitly instead of relying on digits sorting
    # ahead of letters; main() reads it from index 0.
    return [year_match.group(1)] + entries


def main():
    """Parse the command line, then write the CSV table and any summaries."""
    # Command-line arguments without the script name itself.
    args = sys.argv[1:]

    if not args:
        print(_USAGE)
        sys.exit(1)

    # Notice the summary flag and remove it from args if it is present.
    summary = False
    if args[0] == '--summaryfile':
        summary = True
        del args[0]

    # The flag on its own leaves nothing to process.
    if not args:
        print(_USAGE)
        sys.exit(1)

    # Parse every file once; the result feeds both the table and summaries.
    extracted = [(path, extract_names(path)) for path in args]

    # One column per input file, headed by its year.  Concatenating Series
    # pads shorter columns, so files with different numbers of entries
    # still fit into one table.
    columns = [
        pd.Series(data[1:], name=data[0], dtype=object)
        for _, data in extracted
    ]
    table = pd.concat(columns, axis=1)
    table.to_csv(_CSV_NAME)

    # With the summary flag, also write one text file per HTML file holding
    # the year followed by the names and their ranks.
    if summary:
        for path, data in extracted:
            with open(path + ".summary.txt", 'w') as out:
                out.write('\n'.join(data) + '\n')


if __name__ == '__main__':
    main()