-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhelper.py
More file actions
110 lines (67 loc) · 2.74 KB
/
helper.py
File metadata and controls
110 lines (67 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import torch
import pandas as pd
import fasttext
import math
import numpy as np
def init_embedding(fasttext_embed_file):
    """
    Initializes the fasttext word embedding matrix from a .vec text file.

    The file is expected in fasttext text format: a header line
    ("<vocab_size> <dim>", skipped via skiprows=1) followed by one
    "<word> v1 ... vd" line per word.

    Parameters:
        fasttext_embed_file: path to the pretrained fasttext .vec file.

    Returns:
        (embedding_matrix, vocab_size, vocab_dim, word2idx) where row i of
        embedding_matrix is the vector of the word mapped to index i.
    """
    # quoting=3 (QUOTE_NONE): embedding files may contain quote characters
    # as tokens, which must not be treated as CSV quoting.
    df_embedding = pd.read_csv(
        fasttext_embed_file, sep=" ", quoting=3, header=None,
        index_col=0, skiprows=1,
    )
    embedding_matrix = df_embedding.to_numpy()
    vocab_size, vocab_dim = embedding_matrix.shape
    # str() guards against tokens pandas parsed as numbers (e.g. "42").
    word2idx = {str(word): idx for idx, word in enumerate(df_embedding.index)}
    return embedding_matrix, vocab_size, vocab_dim, word2idx
def tokenized_tensor(data, word2idx):
    """
    Returns tokenized tensor vectors for the input sentences.

    Words not present in word2idx are assigned the next free index, so
    word2idx is mutated in place and the returned mapping is a superset
    of the one passed in.

    Parameters:
        data: iterable of str sentences.
        word2idx: dict mapping word -> integer index.

    Returns:
        (output_tokenized, word2idx) where output_tokenized is a list of
        1-D torch tensors of word indices, one per sentence.
    """
    output_tokenized = []
    for sentence in data:
        tokens = fasttext.tokenize(sentence)
        # setdefault returns the existing index, or registers the word
        # under the next free index — replaces the in/else branches.
        indices = [word2idx.setdefault(word, len(word2idx)) for word in tokens]
        output_tokenized.append(torch.tensor(indices))
    return output_tokenized, word2idx
def get_data_sequences(train_dir, valid_dir, test_dir, word2idx):
    """
    Returns tokenized tensor sequences and labels for the train,
    validation and test splits of the dataset.

    Each CSV is expected to have 'tweets' and 'labels' columns. word2idx
    grows as unseen words are encountered (train, then test, then val).

    Returns:
        (train_sequences, train_labels, test_sequences, test_labels,
         val_sequences, val_labels, word2idx)
    """
    def _read_split(csv_path):
        # One split -> (raw tweet texts, label array).
        frame = pd.read_csv(csv_path)
        return frame['tweets'].values, frame['labels'].values

    train_data, train_labels = _read_split(train_dir)
    val_data, val_labels = _read_split(valid_dir)
    test_data, test_labels = _read_split(test_dir)

    # Tokenize in the original order (train, test, val) so new words are
    # assigned the same indices as before.
    train_sequences, word2idx = tokenized_tensor(train_data, word2idx)
    test_sequences, word2idx = tokenized_tensor(test_data, word2idx)
    val_sequences, word2idx = tokenized_tensor(val_data, word2idx)
    return (train_sequences, train_labels, test_sequences, test_labels,
            val_sequences, val_labels, word2idx)
## Create final embedding matrix
def create_embedding(embedding_matrix, word2idx, vocab_size, vocab_dim):
    """
    Returns the complete fasttext embedding matrix.

    Extends the pretrained matrix with randomly initialized rows for every
    word added to word2idx after loading (out-of-vocabulary words plus a
    new '<PAD>' token appended here).

    Parameters:
        embedding_matrix: (vocab_size, vocab_dim) pretrained vectors.
        word2idx: word -> index mapping; mutated in place to add '<PAD>'.
        vocab_size: number of pretrained rows.
        vocab_dim: embedding dimensionality.

    Returns:
        (embedding_matrix, word2idx) with shape (len(word2idx), vocab_dim).
    """
    # Reserve the last index for the padding token.
    word2idx['<PAD>'] = len(word2idx)
    n_new_rows = len(word2idx) - vocab_size
    # Kaiming-uniform init for the new rows; torch.empty replaces the
    # original nn.Parameter wrapper, which implied gradient tracking only
    # to be detached immediately.
    random_rows = torch.empty(n_new_rows, vocab_dim)
    torch.nn.init.kaiming_uniform_(random_rows, a=math.sqrt(5))
    new_matrix = np.zeros((len(word2idx), vocab_dim))
    new_matrix[:vocab_size, :] = embedding_matrix
    new_matrix[vocab_size:, :] = random_rows.numpy()
    return new_matrix, word2idx