[已解决]C++倒排索引表问题求解

Dogelike · 发表于 2021-11-15 21:19:29

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

3,实验要求
(1)输入格式说明:
输入首先给出正整数N(<=100)，为文件总数。随后按以下格式给出每个文件的内容:首先给出文件正文，最后在一行中只给出一个字符“#”,表示文件结束。在N个文件内容结束之后，给出查询总数M(<=104)，随后M行,每行给出一对文件编号，其间以空格分隔。这里假设文件按给出的顺序从1到N编号。
(2)输出格式说明:
针对每一条查询，在一行中输出两文件的相似度,即两文件的公共词汇量占两文件总词汇量的百分比,精确到小数点后1位。注意这里的一个“单词”只包括仅由英文字母组成的、
长度不小于3、且不超过10的英文单词,长度超过10的只考虑前10个字母。单词间以任何非英文字母隔开。另外,大小写不同的同一单词被认为是相同的单词,例如“You”和“you”是同一个单词。
3)样例输入与输出:
输入
3
Aaa Bbb Ccc
#
Bbb Ccc Ddd
#
Aaa2 ccc Eee
is at Ddd@Fff
#
2
1 2
1 3
输出
50.0%
33.3%

下面是我自己写的代码：

#include <iostream>
#include <vector>
#include <string>
#include <iomanip>
#include <algorithm>
using namespace std;
struct Document
{
int docID;
vector<string> words;
};
class CompareFile
{
private:
vector<Document> collection;
int N,M;
string s;
public:
void input()
{
cin>>N;
int i = 0;
string temp;
while(true)
{
getline(cin, temp);
s = s + temp;
if(temp[0] == '#') i++;
if(i == N)
break;
}
for(i = 0; i < N; ++i)
{
collection[i].docID = i + 1;
}
}
bool EqualWords(string s1,string s2)
{
string::size_type i;
if(s1.size() != s2.size())
return false;
else
{
for(i = 0;i < s1.size(); ++i)
{
if(s1[i] != s2[i] && s1[i] != s2[i] + 32 && s1[i] != s2[i] - 32)
return false;
}
}
return true;
}
void indexDocument(int docID)
{
string::size_type n;
int i;
n = 0;
for(i = 0; i < docID - 1; ++i)
{
n = s.find("#",n);
n = n + 1;
}
string indexItem = "";
while(s[n] != '#')
{
//读取单词，给索引项赋值
while (isalpha(s[n]))
{
indexItem += s[n];
n++;
}
if(indexItem.size() < 3)
continue;
if(indexItem.size() >= 10)
indexItem = indexItem.substr(0,10);
//把索引项加入词典
collection[docID - 1].words.push_back(indexItem);
//清空索引项，准备下一次
indexItem = "";
n++;
}
}
void indexCollection()
{
//打开对应的文件并索引
for (int i = 0; i < N; i++)
{
//索引单篇文档
indexDocument(collection[i].docID);
}
}
static bool cmp(string a, string b)
{
return a < b;//词项按照从小到大排序
}
void sortIndex(Document doc)
{
sort(doc.words.begin(), doc.words.end(), cmp);
}
void mergeIndex()
{
int j;
for(j = 0; j < N; ++j)
{
sortIndex(collection[j]);
}
vector<string>::iterator it_cur;//创建迭代器
vector<string>::iterator it_next;
vector<Document>::iterator i = collection.begin();
while(i != collection.end())
{
it_cur = (*i).words.begin();
it_next = it_cur + 1;
while (it_cur != (*i).words.end())
{
if(it_cur + 1 != (*i).words.end()) it_next = it_cur + 1;
else break;
while(EqualWords((*it_cur),(*it_next)))
{//这个循环内处理掉所有与当前词项重复的词项
(*i).words.erase(it_next);//删除重复项
if (it_cur + 1 != (*i).words.end()) it_next = it_cur + 1;
else break;
}
it_cur++;
}
i++;
}
}
void Compare(int docID1,int docID2)
{
int a = docID1 - 1 ,b = docID2 - 1;
int Count = 0,sum;
sum = collection[a].words.size() + collection[b].words.size();
vector<string>::iterator it_cur1;//创建迭代器
vector<string>::iterator it_cur2;
it_cur1 = collection[a].words.begin();
it_cur2 = collection[b].words.begin();
while(it_cur1 != collection[a].words.end())
{
while(it_cur2 != collection[b].words.end())
{
if(EqualWords((*it_cur1),(*it_cur2)))
{
Count++;
sum--;
break;
}
it_cur2++;
}
it_cur1++;
}
double r = Count/sum * 100;
cout<<setiosflags(ios::fixed)<<setprecision(1)<<r<<"%"<<endl;
}
//~CompareFile();
};
int main()
{
CompareFile t;
t.input();
t.indexCollection();
t.mergeIndex();
t.Compare(1,2);
//for(int i = 0;i < N;++i)
// cout<<s[i];
//int a,b;
//cin>>a>>b;
//t.Compare(s,a,b);
return 0;
}

复制代码

不知道为啥运行没反应

最佳答案

月排行榜 / 总排行榜

jhq999

2021-11-17 16:16:29

本帖最后由 jhq999 于 2021-11-17 16:37 编辑

#include <stdio.h>
#include <iostream>
#include <string>
using namespace std;
#define STRINGMAXSIZE 64
int wordsum(string *instr,string *words=NULL,int count=0)//只输入一个参数时，返回字符串单词数量，否则分离出单词
{
int i = 0;
int start=0;
if (NULL==words)
{
for (i=0;instr->c_str()[i]; i++)
{
if (('a'>instr->c_str()[i]||'z'<instr->c_str()[i])&&('A'>instr->c_str()[i]||'Z'<instr->c_str()[i]))
{
if (0x20!=instr->c_str()[i])
{
*instr=instr->replace(i,1," ");//把不是字母的都替换成空格
}
if (instr->substr(start,i-start).c_str()[0]&&(0x20!=instr->substr(start,i-start).c_str()[0]))//如果子字符串开头不是0和空格单词数量加一
{
count++;
}
start=i+1;
}
}
return count;
}
else if (count)//分离出单词
{
for (i=0;instr->c_str()[i]; i++)
{
if (0x20==instr->c_str()[i])
{
if (instr->substr(start,i-start).c_str()[0]&&0x20!=instr->substr(start,i-start).c_str()[0])
{
words[--count]=instr->substr(start,i-start);
if (words[count].length()>10)//大于10的单词取10个字符
{
words[count]=words[count].substr(0,10);
}
}
start=i+1;
}
}
}
return count;
}
double cmpword(string *words1,string *words2,int word1count,int word2count)
{
double count=0;
bool flag=0;
for (int i = 0; i < word1count; i++)
{
if (words1[i].length()<3)continue;//小于3丢掉
flag=0;
for (int j = 0; j <word2count; j++)
{
if (words2[j].length()<3)continue;//小于3丢掉
if (!_stricmp(words1[i].c_str(),words2[j].c_str()))//不区分大小写比对
{
if (0==flag)
{
count++;
flag=1;
}
count++;
cout<<words1[i]<<endl;
}
}
}
return (count*100)/(double)(word1count+word2count);
}
int cmpstr(string *strs,int n,int (*team)[2],int m)
{
int i=0,j=0,*wordcount=new int[n];
string **word=new string*[n];
for (i = 0; i < n; i++)
{
wordcount[i]=wordsum(&strs[i]);
word[i]=new string[wordcount[i]];
wordsum(&strs[i],word[i],wordcount[i]);
}
for (i = 0; i < m; i++)
{
//cout<<setprecision(1)<<cmpword(word[team[i][0]-1],word[team[i][1]-1],wordcount[team[i][0]-1],wordcount[team[i][1]-1])<<endl;
printf("%.1lf%%\n",cmpword(word[team[i][0]-1],word[team[i][1]-1],wordcount[team[i][0]-1],wordcount[team[i][1]-1]));
}
for (i = 0; i < n; i++)
{
delete[] word[i];
}
delete[] wordcount;
delete[] word;
return 0;
}
int main()
{
int i=0,j=0,m=0,n=0;
cin>>n;
fflush(stdin);
string *strs=new string[n];
for (i = 0; i <n; i++)
{
strs[i].resize(STRINGMAXSIZE);
scanf("%[^#]",strs[i].data());
fflush(stdin);
}
cin>>m;
fflush(stdin);
int (*team)[2]=(int(*)[2])(new int[2*m*4]);
for (i = 0; i < m; i++)
{
cin>>team[i][0]>>team[i][1];
}
cmpstr(strs,n,team,m);
delete[] team;
delete[] strs;
return 0;
}

复制代码

3
Aaa Bbb Ccc ABCDEFGHIJ
#
Bbb Ccc Ddd
#
Aaa2 ccc Eee
is at Ddd@Fff
abcdefghijklmn
#
3
1 2
1 3
2 3
Ccc
Bbb
57.1%
ABCDEFGHIJ
Ccc
Aaa
50.0%
Ddd
Ccc
36.4%

复制代码

跳转到最佳答案楼层

傻眼貓咪 · 发表于 2021-11-15 21:27:17

这题很不错，有挑战性

Dogelike · 发表于 2021-11-16 13:14:04

傻眼貓咪发表于 2021-11-15 21:27
这题很不错，有挑战性

做出来了吗哥

傻眼貓咪 · 发表于 2021-11-16 17:48:53

Dogelike 发表于 2021-11-16 13:14
做出来了吗哥

抱歉，我试了，但是不能，可能需要其他大佬看看了

人造人 · 发表于 2021-11-16 17:52:43

Dogelike 发表于 2021-11-16 13:14
做出来了吗哥

33.3% 是怎么算出来的？

jhq999 · 发表于 2021-11-17 16:16:29

这个最佳答案由 jhq999 给出，感谢 jhq999 的回答。

单击隐藏图章

本帖最后由 jhq999 于 2021-11-17 16:37 编辑

#include <stdio.h>
#include <iostream>
#include <string>
using namespace std;
#define STRINGMAXSIZE 64
int wordsum(string *instr,string *words=NULL,int count=0)//只输入一个参数时，返回字符串单词数量，否则分离出单词
{
int i = 0;
int start=0;
if (NULL==words)
{
for (i=0;instr->c_str()[i]; i++)
{
if (('a'>instr->c_str()[i]||'z'<instr->c_str()[i])&&('A'>instr->c_str()[i]||'Z'<instr->c_str()[i]))
{
if (0x20!=instr->c_str()[i])
{
*instr=instr->replace(i,1," ");//把不是字母的都替换成空格
}
if (instr->substr(start,i-start).c_str()[0]&&(0x20!=instr->substr(start,i-start).c_str()[0]))//如果子字符串开头不是0和空格单词数量加一
{
count++;
}
start=i+1;
}
}
return count;
}
else if (count)//分离出单词
{
for (i=0;instr->c_str()[i]; i++)
{
if (0x20==instr->c_str()[i])
{
if (instr->substr(start,i-start).c_str()[0]&&0x20!=instr->substr(start,i-start).c_str()[0])
{
words[--count]=instr->substr(start,i-start);
if (words[count].length()>10)//大于10的单词取10个字符
{
words[count]=words[count].substr(0,10);
}
}
start=i+1;
}
}
}
return count;
}
double cmpword(string *words1,string *words2,int word1count,int word2count)
{
double count=0;
bool flag=0;
for (int i = 0; i < word1count; i++)
{
if (words1[i].length()<3)continue;//小于3丢掉
flag=0;
for (int j = 0; j <word2count; j++)
{
if (words2[j].length()<3)continue;//小于3丢掉
if (!_stricmp(words1[i].c_str(),words2[j].c_str()))//不区分大小写比对
{
if (0==flag)
{
count++;
flag=1;
}
count++;
cout<<words1[i]<<endl;
}
}
}
return (count*100)/(double)(word1count+word2count);
}
int cmpstr(string *strs,int n,int (*team)[2],int m)
{
int i=0,j=0,*wordcount=new int[n];
string **word=new string*[n];
for (i = 0; i < n; i++)
{
wordcount[i]=wordsum(&strs[i]);
word[i]=new string[wordcount[i]];
wordsum(&strs[i],word[i],wordcount[i]);
}
for (i = 0; i < m; i++)
{
//cout<<setprecision(1)<<cmpword(word[team[i][0]-1],word[team[i][1]-1],wordcount[team[i][0]-1],wordcount[team[i][1]-1])<<endl;
printf("%.1lf%%\n",cmpword(word[team[i][0]-1],word[team[i][1]-1],wordcount[team[i][0]-1],wordcount[team[i][1]-1]));
}
for (i = 0; i < n; i++)
{
delete[] word[i];
}
delete[] wordcount;
delete[] word;
return 0;
}
int main()
{
int i=0,j=0,m=0,n=0;
cin>>n;
fflush(stdin);
string *strs=new string[n];
for (i = 0; i <n; i++)
{
strs[i].resize(STRINGMAXSIZE);
scanf("%[^#]",strs[i].data());
fflush(stdin);
}
cin>>m;
fflush(stdin);
int (*team)[2]=(int(*)[2])(new int[2*m*4]);
for (i = 0; i < m; i++)
{
cin>>team[i][0]>>team[i][1];
}
cmpstr(strs,n,team,m);
delete[] team;
delete[] strs;
return 0;
}

复制代码

3
Aaa Bbb Ccc ABCDEFGHIJ
#
Bbb Ccc Ddd
#
Aaa2 ccc Eee
is at Ddd@Fff
abcdefghijklmn
#
3
1 2
1 3
2 3
Ccc
Bbb
57.1%
ABCDEFGHIJ
Ccc
Aaa
50.0%
Ddd
Ccc
36.4%

复制代码

Dogelike · 发表于 2021-11-21 19:02:59

人造人发表于 2021-11-16 17:52
33.3% 是怎么算出来的？

一共有六个词（重复算一次），其中AAA,CCC重复，is和at长度不过二不算

Dogelike · 发表于 2021-11-21 19:03:59

jhq999 发表于 2021-11-17 16:16

牛哇大佬

人造人 · 发表于 2021-11-21 19:08:04

Dogelike 发表于 2021-11-21 19:02
一共有六个词（重复算一次），其中AAA,CCC重复，is和at长度不过二不算

嗯，明白了

Dogelike · 发表于 2021-11-21 19:34:25

jhq999 发表于 2021-11-17 16:16

大佬你的程序应该词数记错了，输出结果和我放的不一样

账号		自动登录	找回密码
密码			立即注册