1. 文本摘要生成(综合)



  1. 首先分析文本中非停用词(stop-word)的出现频度;

  2. 统计文本中每个句子中非停用词频度之和。若某个非停用词在一个句子中出现多次,则都要计算;

  3. 按非停用词频度之和由高至低输出前N个句子。


  • 单词为仅由字母组成的字符序列。包含大写字母的单词应将大写字母转换为小写字母后进行词频统计。

  • 句子是由下面符号分隔的段落:句号(.)、问号(?)和惊叹号(!)。

  • 在自然语言处理中,停用词(stop-word)指的是在文本分析时不提供额外语义信息的词,例如英文单词 a、an、he、you 等;所有停用词由一个停用词表给出。











50286 James’s eyes were hazel, his nose was slightly longer than Harry’s and there was no scar on his forehead, but they had the same thin face, same mouth, same eyebrows; James’s hair stuck up at the back exactly as Harry’s did, his hands could have been Harry’s and Harry could tell that, when James stood up, they would be within an inch of each other in height.

48188 I didn’t practise, I didn’t bother, I could’ve stopped myself having those dreams, Hermione kept telling me to do it, if I had he’d never have been able to show me where to go, and-Sirius wouldn’t-Sirius wouldn’t-'Something was erupting inside Harry’s head: a need to justify himself, to explain-I tried to check he’d really taken Sirius, I went to Umbridge’s office, I spoke to Kreacher in the fire and he said Sirius wasn’t there, he said he’d gone!

39986 Little Ginny’s been writing in it for months and months, telling me all her pitiful worries and woes - how her brothers tease her, how she had to come to school with secondhand robes and books, how - Riddle’s eyes glinted - how she didn’t think famous, good, great Harry Potter would ever like herAll the time he spoke, Riddle’s eyes never left Harry’s face.

39455 I mean, it was really great of you and everything,’ said Hermione quickly, looking positively petrified at the look on Harry’s face, everyone thought it was a wonderful thing to do-‘That’s funny,’ said Harry through gritted teeth, because I definitely remember Ron saying I’d wasted time acting the hero .

39438 Harry, Ginny and Neville and each of the Death Eaters turned in spite of themselves to watch the top of the tank as a brain burst from the green liquid like a leaping fish: for a moment it seemed suspended in midair, then it soared towards Ron, spinning as it came, and what looked like ribbons of moving images flew from it, unravelling like rolls of film-Ha ha ha, Harry, look at it-’ said Ron, watching it disgorge its gaudy innards, Harry, come and touch it; bet it’s weird-'RON, NO!








二维数组形式的 字典树 + 快排



https://www.jq22.com/textDifference 和


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAXWORD 25            /* word buffer size, including the terminating NUL */
#define MAXSENTENCE 215000    /* capacity of the sentence table */
#define MAXSENTENCEWORD 1500  /* sentence buffer size, including the terminating NUL */

/* Reads the next lower-cased word from bfp into word; returns EOF when none is left. */
int getword(FILE *bfp, char word[]);

void insert_stopwords(char *str);
int search_stopwords(char *str);

void insert_articlewords(char *str);
int search_articlewords(char *str);

/* One sentence of the article together with its score. */
struct Sentence {
    char sentence[MAXSENTENCEWORD]; /* sentence text, NUL-terminated */
    int num;                        /* sum of the frequencies of its non-stop-words */
    int order;                      /* position in the article, used to break ties */
};
typedef struct Sentence st;

/* Sentence table; static storage so that every entry starts zeroed. */
struct Sentence sentence[MAXSENTENCE];

int getsentence(FILE *bfp, struct Sentence *item);

/*
 * qsort() comparator: higher score first; equal scores keep their original
 * text order.  The explicit tie-break is required because the C standard
 * does not guarantee that qsort() is stable.
 */
int cmp(const void *a, const void *b)
{
    const st *pa = a;
    const st *pb = b;

    if (pa->num != pb->num)
        return (pb->num > pa->num) - (pb->num < pa->num); /* descending by score */
    return (pa->order > pb->order) - (pa->order < pb->order);
}

/*
 * Text summarisation:
 *   1. count the frequency of every non-stop-word in article.txt;
 *   2. score every sentence (terminated by '.', '?' or '!') with the sum of
 *      the frequencies of its non-stop-words, counting repeated words each time;
 *   3. print the 5 best sentences to stdout and write the N best sentences
 *      (N is read from stdin) to results.txt, one "score sentence" per line.
 * Words are compared case-insensitively.  Returns 0 on success, -1 when a
 * file cannot be opened or written.
 */
int main(int argc, const char * argv[])
{
    (void)argc;
    (void)argv;

    FILE *file_stopwords = fopen("stopwords.txt", "r");
    if (file_stopwords == NULL) {
        fprintf(stderr, "stopwords.txt can’t open!\n");
        return -1;
    }

    FILE *file_article = fopen("article.txt", "r");
    if (file_article == NULL) {
        fprintf(stderr, "article.txt can’t open!\n");
        fclose(file_stopwords);
        return -1;
    }

    /* Number of sentences to write to results.txt; bad input means none. */
    int N = 0;
    if (scanf("%d", &N) != 1 || N < 0)
        N = 0;

    /* Load the stop-word list; the field width keeps fscanf inside the buffer. */
    char stopword[128];
    while (fscanf(file_stopwords, "%127s", stopword) == 1)
        insert_stopwords(stopword);
    fclose(file_stopwords);

    /* Pass 1: frequency of every non-stop-word in the article. */
    char word[MAXWORD];
    while (getword(file_article, word) != EOF) {
        if (!search_stopwords(word))
            insert_articlewords(word);
    }

    /* Pass 2: re-read the same stream from the start and score each sentence. */
    rewind(file_article);
    int count = 0;
    while (count < MAXSENTENCE &&
           getsentence(file_article, &sentence[count]) != EOF) {
        sentence[count].order = count;
        count++;
    }
    fclose(file_article);

    qsort(sentence, (size_t)count, sizeof sentence[0], cmp);

    /* Top 5 to stdout (fewer when the article has fewer sentences). */
    int shown = count < 5 ? count : 5;
    for (int i = 0; i < shown; i++)
        printf("%d %s\n", sentence[i].num, sentence[i].sentence);

    /* Top N to results.txt, same format. */
    FILE *file_results = fopen("results.txt", "w");
    if (file_results == NULL) {
        fprintf(stderr, "results.txt can’t open!\n");
        return -1;
    }
    int written = N < count ? N : count;
    for (int i = 0; i < written; i++)
        fprintf(file_results, "%d %s\n", sentence[i].num, sentence[i].sentence);
    if (fclose(file_results) != 0)
        return -1;

    return 0;
}
/* Fallback so this section stays self-contained; normally defined at the top of the file. */
#ifndef MAXWORD
#define MAXWORD 25
#endif

enum { STOPWORD_TRIE_NODES = 10010 };

/* Stop-word trie stored as a 2-D array: trie_stopwords[node][letter] is the
 * child node index, 0 meaning "no child" (node 0 is the root). */
int trie_stopwords[STOPWORD_TRIE_NODES][26] = {0};
int num_stopwords[STOPWORD_TRIE_NODES] = {0};   /* 1 when a stop-word ends at the node */
int pos_stopwords = 0;                          /* index of the last allocated node */

/* Maps an ASCII letter (either case) to 0..25; returns -1 for anything else. */
static int stopword_slot(char ch)
{
    if (ch >= 'a' && ch <= 'z')
        return ch - 'a';
    if (ch >= 'A' && ch <= 'Z')
        return ch - 'A';
    return -1;
}

/*
 * Adds str to the stop-word trie, ignoring letter case.
 * Empty strings and strings containing a non-letter are ignored: article
 * words consist of letters only, so such an entry could never match, and
 * indexing the trie with it would run outside the array.
 */
void insert_stopwords(char *str)
{
    if (str[0] == '\0')
        return;
    for (int i = 0; str[i]; i++) {
        if (stopword_slot(str[i]) < 0)
            return;
    }

    int node = 0;
    for (int i = 0; str[i]; i++) {
        int slot = stopword_slot(str[i]);
        if (trie_stopwords[node][slot] == 0) {
            if (pos_stopwords + 1 >= STOPWORD_TRIE_NODES)
                return;                 /* trie is full: drop the word */
            trie_stopwords[node][slot] = ++pos_stopwords;
        }
        node = trie_stopwords[node][slot];
    }
    num_stopwords[node] = 1;            /* mark end of word (each word stored once) */
}

/* Returns 1 when str is a stop-word (case-insensitive), 0 otherwise. */
int search_stopwords(char *str)
{
    int node = 0;
    for (int i = 0; str[i]; i++) {
        int slot = stopword_slot(str[i]);
        if (slot < 0 || trie_stopwords[node][slot] == 0)
            return 0;                   /* not present */
        node = trie_stopwords[node][slot];
    }
    return num_stopwords[node] == 1;
}

/*
 * Reads the next word (a maximal run of letters) from bfp into word,
 * converted to lower case and NUL-terminated.  word must have room for
 * MAXWORD bytes; letters beyond MAXWORD-1 are read but discarded.
 * Returns 0 when a word was stored, EOF when the stream has no more words.
 */
int getword(FILE *bfp, char word[])
{
    int len = 0;
    int ch;     /* int, not char, so that EOF is distinguishable from data */

    while ((ch = fgetc(bfp)) != EOF) {
        if (isalpha(ch)) {
            if (len < MAXWORD - 1)
                word[len++] = (char)tolower(ch);
        } else if (len > 0) {           /* a delimiter ends the current word */
            word[len] = '\0';
            return 0;
        }
    }

    if (len > 0) {                      /* last word, not followed by a delimiter */
        word[len] = '\0';
        return 0;
    }
    return EOF;
}
int trie_articlewords[1000010][26]={0};
int num_articlewords[1000010]={0};
int pos_articlewords=0;void insert_articlewords(char *str)
{int p=0;for(int i=0; str[i]; i++){int n=str[i]-'a';if(trie_articlewords[p][n]==0)trie_articlewords[p][n]=++pos_articlewords;p=trie_articlewords[p][n];}num_articlewords[p]++; //在p结尾的单词数量+1 单词唯一
}int search_articlewords(char *str)
{int p=0;for(int i=0;str[i];i++){int n=str[i]-'a';
//        if(trie_articlewords[p][n]==0) return 0;//不存在p=trie_articlewords[p][n];}return num_articlewords[p];
}int getsentence(FILE *bfp,struct Sentence *item)
{int i=0,j=0;char temp,word[MAXWORD];while((temp=fgetc(bfp))!=EOF){if (i==0&&temp==' ')  continue;//注意:处理句首空格item->sentence[i++]=temp;if(isalpha(temp)){word[j]=tolower(temp);j++;}else if(j>0){ //说明i中已经至少有一个字符word[j]='\0';if (!search_stopwords(word))item->num=item->num+search_articlewords(word);j=0;memset(word, 0, sizeof(word));}if(temp=='.'||temp=='?'||temp=='!'){ //句子结束item->sentence[i+1]='\0';return 0;}}return EOF;

