|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
3,实验要求
(1)输入格式说明:
输入首先给出正整数N(<=100),为文件总数。随后按以下格式给出每个文件的内容:首先给出文件正文,最后在一行中只给出一个字符“#”,表示文件结束。在N个文件内容结束之后,给出查询总数M(<=104),随后M行,每行给出一对文件编号,其间以空格分隔。这里假设文件按给出的顺序从1到N编号。
(2)输出格式说明:
针对每一条查询,在一行中输出两文件的相似度,即两文件的公共词汇量占两文件总词汇量的百分比,精确到小数点后1位。注意这里的一个“单词”只包括仅由英文字母组成的、
长度不小于3、且不超过10的英文单词,长度超过10的只考虑前10个字母。单词间以任何非英文字母隔开。另外,大小写不同的同一单词被认为是相同的单词,例如“You”和“you”是同一个单词。
3)样例输入与输出:
输入
3
Aaa Bbb Ccc
#
Bbb Ccc Ddd
#
Aaa2 ccc Eee
is at Ddd@Fff
#
2
1 2
1 3
输出
50.0%
33.3%
下面是我自己写的代码:
- #include <iostream>
- #include <vector>
- #include <string>
- #include <iomanip>
- #include <algorithm>
- using namespace std;
- struct Document
- {
- int docID;
- vector<string> words;
- };
- class CompareFile
- {
- private:
- vector<Document> collection;
- int N,M;
- string s;
- public:
- void input()
- {
- cin>>N;
- int i = 0;
- string temp;
- while(true)
- {
- getline(cin, temp);
- s = s + temp;
- if(temp[0] == '#') i++;
- if(i == N)
- break;
- }
- for(i = 0; i < N; ++i)
- {
- collection[i].docID = i + 1;
- }
- }
- bool EqualWords(string s1,string s2)
- {
- string::size_type i;
- if(s1.size() != s2.size())
- return false;
- else
- {
- for(i = 0;i < s1.size(); ++i)
- {
- if(s1[i] != s2[i] && s1[i] != s2[i] + 32 && s1[i] != s2[i] - 32)
- return false;
- }
- }
- return true;
- }
- void indexDocument(int docID)
- {
- string::size_type n;
- int i;
- n = 0;
- for(i = 0; i < docID - 1; ++i)
- {
- n = s.find("#",n);
- n = n + 1;
- }
- string indexItem = "";
- while(s[n] != '#')
- {
- //读取单词,给索引项赋值
- while (isalpha(s[n]))
- {
- indexItem += s[n];
- n++;
- }
- if(indexItem.size() < 3)
- continue;
- if(indexItem.size() >= 10)
- indexItem = indexItem.substr(0,10);
- //把索引项加入词典
- collection[docID - 1].words.push_back(indexItem);
- //清空索引项,准备下一次
- indexItem = "";
- n++;
- }
- }
- void indexCollection()
- {
- //打开对应的文件并索引
- for (int i = 0; i < N; i++)
- {
- //索引单篇文档
- indexDocument(collection[i].docID);
- }
- }
- static bool cmp(string a, string b)
- {
- return a < b;//词项按照从小到大排序
- }
- void sortIndex(Document doc)
- {
- sort(doc.words.begin(), doc.words.end(), cmp);
- }
- void mergeIndex()
- {
- int j;
- for(j = 0; j < N; ++j)
- {
- sortIndex(collection[j]);
- }
- vector<string>::iterator it_cur;//创建迭代器
- vector<string>::iterator it_next;
- vector<Document>::iterator i = collection.begin();
- while(i != collection.end())
- {
- it_cur = (*i).words.begin();
- it_next = it_cur + 1;
- while (it_cur != (*i).words.end())
- {
- if(it_cur + 1 != (*i).words.end()) it_next = it_cur + 1;
- else break;
- while(EqualWords((*it_cur),(*it_next)))
- {//这个循环内处理掉所有与当前词项重复的词项
- (*i).words.erase(it_next);//删除重复项
- if (it_cur + 1 != (*i).words.end()) it_next = it_cur + 1;
- else break;
- }
- it_cur++;
- }
- i++;
- }
- }
- void Compare(int docID1,int docID2)
- {
- int a = docID1 - 1 ,b = docID2 - 1;
- int Count = 0,sum;
- sum = collection[a].words.size() + collection[b].words.size();
- vector<string>::iterator it_cur1;//创建迭代器
- vector<string>::iterator it_cur2;
- it_cur1 = collection[a].words.begin();
- it_cur2 = collection[b].words.begin();
- while(it_cur1 != collection[a].words.end())
- {
- while(it_cur2 != collection[b].words.end())
- {
- if(EqualWords((*it_cur1),(*it_cur2)))
- {
- Count++;
- sum--;
- break;
- }
- it_cur2++;
- }
- it_cur1++;
- }
- double r = Count/sum * 100;
- cout<<setiosflags(ios::fixed)<<setprecision(1)<<r<<"%"<<endl;
- }
- //~CompareFile();
- };
- int main()
- {
- CompareFile t;
- t.input();
- t.indexCollection();
- t.mergeIndex();
- t.Compare(1,2);
- //for(int i = 0;i < N;++i)
- // cout<<s[i];
- //int a,b;
- //cin>>a>>b;
- //t.Compare(s,a,b);
- return 0;
- }
复制代码
不知道为啥运行没反应
本帖最后由 jhq999 于 2021-11-17 16:37 编辑
- #include <stdio.h>
- #include <iostream>
- #include <string>
- using namespace std;
- #define STRINGMAXSIZE 64
- int wordsum(string *instr,string *words=NULL,int count=0)//只输入一个参数时,返回字符串单词数量,否则分离出单词
- {
- int i = 0;
- int start=0;
- if (NULL==words)
- {
- for (i=0;instr->c_str()[i]; i++)
- {
- if (('a'>instr->c_str()[i]||'z'<instr->c_str()[i])&&('A'>instr->c_str()[i]||'Z'<instr->c_str()[i]))
- {
- if (0x20!=instr->c_str()[i])
- {
- *instr=instr->replace(i,1," ");//把不是字母的都替换成空格
- }
- if (instr->substr(start,i-start).c_str()[0]&&(0x20!=instr->substr(start,i-start).c_str()[0]))//如果子字符串开头不是0和空格单词数量加一
- {
- count++;
- }
- start=i+1;
- }
- }
- return count;
- }
- else if (count)//分离出单词
- {
- for (i=0;instr->c_str()[i]; i++)
- {
- if (0x20==instr->c_str()[i])
- {
- if (instr->substr(start,i-start).c_str()[0]&&0x20!=instr->substr(start,i-start).c_str()[0])
- {
-
- words[--count]=instr->substr(start,i-start);
- if (words[count].length()>10)//大于10的单词取10个字符
- {
- words[count]=words[count].substr(0,10);
- }
- }
- start=i+1;
- }
- }
- }
- return count;
- }
- double cmpword(string *words1,string *words2,int word1count,int word2count)
- {
- double count=0;
- bool flag=0;
- for (int i = 0; i < word1count; i++)
- {
- if (words1[i].length()<3)continue;//小于3丢掉
-
- flag=0;
- for (int j = 0; j <word2count; j++)
- {
- if (words2[j].length()<3)continue;//小于3丢掉
- if (!_stricmp(words1[i].c_str(),words2[j].c_str()))//不区分大小写比对
- {
- if (0==flag)
- {
- count++;
- flag=1;
- }
- count++;
- cout<<words1[i]<<endl;
- }
- }
- }
- return (count*100)/(double)(word1count+word2count);
- }
- int cmpstr(string *strs,int n,int (*team)[2],int m)
- {
- int i=0,j=0,*wordcount=new int[n];
- string **word=new string*[n];
- for (i = 0; i < n; i++)
- {
- wordcount[i]=wordsum(&strs[i]);
- word[i]=new string[wordcount[i]];
- wordsum(&strs[i],word[i],wordcount[i]);
- }
- for (i = 0; i < m; i++)
- {
- //cout<<setprecision(1)<<cmpword(word[team[i][0]-1],word[team[i][1]-1],wordcount[team[i][0]-1],wordcount[team[i][1]-1])<<endl;
- printf("%.1lf%%\n",cmpword(word[team[i][0]-1],word[team[i][1]-1],wordcount[team[i][0]-1],wordcount[team[i][1]-1]));
- }
- for (i = 0; i < n; i++)
- {
- delete[] word[i];
- }
- delete[] wordcount;
- delete[] word;
- return 0;
- }
- int main()
- {
- int i=0,j=0,m=0,n=0;
- cin>>n;
- fflush(stdin);
- string *strs=new string[n];
- for (i = 0; i <n; i++)
- {
- strs[i].resize(STRINGMAXSIZE);
- scanf("%[^#]",strs[i].data());
- fflush(stdin);
- }
- cin>>m;
- fflush(stdin);
- int (*team)[2]=(int(*)[2])(new int[2*m*4]);
- for (i = 0; i < m; i++)
- {
- cin>>team[i][0]>>team[i][1];
- }
- cmpstr(strs,n,team,m);
- delete[] team;
- delete[] strs;
- return 0;
- }
复制代码- 3
- Aaa Bbb Ccc ABCDEFGHIJ
- #
- Bbb Ccc Ddd
- #
- Aaa2 ccc Eee
- is at Ddd@Fff
- abcdefghijklmn
- #
- 3
- 1 2
- 1 3
- 2 3
- Ccc
- Bbb
- 57.1%
- ABCDEFGHIJ
- Ccc
- Aaa
- 50.0%
- Ddd
- Ccc
- 36.4%
复制代码
|
|