-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimplement.py
More file actions
25 lines (22 loc) · 829 Bytes
/
implement.py
File metadata and controls
25 lines (22 loc) · 829 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
# Cluster the lines of tweet_with_url.txt with TF-IDF + k-means and report
# the ten highest-weighted terms of every cluster, both on stdout and
# appended to words_per_cluster.txt.

# Load the corpus: one document per input line, surrounding whitespace removed.
with open("tweet_with_url.txt", 'r') as data:
    text = [line.strip() for line in data]

# Build the TF-IDF weighted document/term matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text)

# Fit k-means with a fixed number of clusters.
true_k = 3
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")

# For each centroid, feature indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and only fall back to the old name on installs that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# The context manager guarantees the report is flushed and the handle closed,
# even if writing fails part-way. Mode 'a' appends to any earlier report.
with open('words_per_cluster.txt', 'a') as report:
    for i in range(true_k):
        header = "Cluster %d:" % i
        print(header)
        report.write(header + '\n')
        report.write('\n' + '\n')
        # Only the ten strongest terms of this cluster are reported.
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind])
            report.write(terms[ind] + '\n')
        # Blank line separating this cluster's terms from the next cluster.
        report.write('\n')