-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetSimilarity.cpp
More file actions
91 lines (88 loc) · 2.83 KB
/
GetSimilarity.cpp
File metadata and controls
91 lines (88 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include "GetSimilarity.h"

#include <vector>
void GetSimilarity::readToMap() {
unordered_map<wstring, double> *p = nullptr;
if(timer == true)
cout <<"Reading file and count words started at:" << clock() << "ms" << endl;
int cwords = 0;
int cfiles = 0;
wstring wstr;
while (getline(winfile, wstr)) {
if (wstr.size() == 0)
continue;
auto p1 = wstr.begin(), p2 = p1;
if (wstr[16] == '0'&&wstr[17] == '0'&&wstr[18] == '1') {//如果第16,17,18个字符为001,则表示新开始一篇文章
if (tf.size() != 0) {
for (auto &it : **tf.rbegin())
it.second /= cwords;//前一个map中的每个value除以文章中的总词数.
cwords = 0;
}
textname.push_back(string(wstr.begin(), wstr.begin() + 15));//记录文章标题.
p = new unordered_map<wstring, double>;//为每篇文章创建一个新的map.
tf.push_back(p);//指针放在tf vector中.
}
while (1) {
while (*(p1++) != ' ');
p2 = ++p1;
if (p1 == wstr.end())
break;
while (*(++p2) != ' ');
p2--;
if (*p2 != 'r'&&*p2 != 'p'&&*p2 != 'c'&&*p2 != 'u' \
&&*p2 != 'y'&&*p2 != 'e'&&*p2 != 'o'&&*p2 != 'w'&&*p2 != 'q') {//去除无效词.
while (*(--p2) != '/');
if (p->insert({ wstring(p1, p2),1 }).second == 0)//尝试插入到tf的map中,如果失败,
p->find(wstring(p1, p2))->second++;//则表明key以存在,则value++.
else//如果插入成功,则表示该关键词在这篇文章中第一次出现,
if (idf.insert({ wstring(p1, p2),1 }).second == 0)//此时尝试插入到idf map中.如果该尝试失败,
idf.find(wstring(p1, p2))->second++;//则表示这一关键词曾在其他文章中出现过,value++即可.
cwords++;
p1 = p2;
}
}
}
auto it = *tf.rbegin();
for (auto it2 = it->begin(); it2 != it->end(); it2++)
it2->second /= cwords;
cwords = 0;
cfiles = tf.size();
for (auto &i : idf) {
i.second = cfiles / i.second;
i.second = log(i.second);
}
for (auto &i : tf)
for (auto &j : *i)
j.second *= idf.find(j.first)->second;
}
void GetSimilarity::culculateSim() {
if (timer == true)
cout << "Culculate similarity and output file started at:" << clock() << "ms" << endl;
double InnerProduct = 0;
double l1 = 0, l2 = 0;
double answer;
auto itname = textname.begin();
for (auto it1 = tf.begin(); it1 != tf.end(); it1++) {
outfile << *itname << ":";
itname++;
for (auto it2 = tf.begin(); it2 != it1 + 1; it2++) {
for (auto &word : **it1) {
if ((**it2).find(word.first) != (**it2).end())
InnerProduct += word.second * (**it2).find(word.first)->second;//计算每两篇文章权重的内积.
l1 += word.second * word.second;//计算每个权重的平方.
}
for (auto &word : **it2)
l2 += word.second*word.second;//计算每个权重的平方.
answer = InnerProduct / (sqrt(l1) * sqrt(l2));//计算相关度.
InnerProduct = 0;
l1 = 0;
l2 = 0;
outfile << answer << ' ';//输出相关度.
}
outfile << endl;
}
if (timer == true)
cout << "Finish at:" << clock() << "ms" << endl;
}
/**
 * Runs the whole pipeline on this object: first readToMap(), then
 * culculateSim(). The order matters because culculateSim() works on the
 * tables that readToMap() fills.
 */
void GetSimilarity::operator()() {
	this->readToMap();
	this->culculateSim();
}