


(2) 得到文件出现的字符种类数



#ifndef HUFFMAN_HUFFMAN_H
#define HUFFMAN_HUFFMAN_H

#include <stdio.h>

/*
 * Public interface of the Huffman coder.
 *
 * Every function returns 0 on success and a non-zero value on failure.
 */

/*
 * Huffman-encodes the stream `in` into `out`. `out_Table` receives a
 * human-readable per-symbol statistics table (frequency, code length, code).
 * step1: parameter added by yzhang for huffman statistics.
 */
int huffman_encode_file(FILE *in, FILE *out, FILE *out_Table);

/* Decodes a stream produced by huffman_encode_file from `in` into `out`. */
int huffman_decode_file(FILE *in, FILE *out);

/*
 * Encodes bufinlen bytes at bufin. On return *pbufout points to a heap
 * buffer of *pbufoutlen bytes (allocated with malloc/realloc).
 */
int huffman_encode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen);

/*
 * Decodes bufinlen bytes at bufin. On return *bufout points to a heap
 * buffer of *pbufoutlen decoded bytes (allocated with malloc).
 */
int huffman_decode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **bufout,
                          unsigned int *pbufoutlen);

#endif


#include <stdio.h>
#include <stdlib.h>
#include <string.h>/* declarations to provide consistent linkage */
extern char *optarg;
extern int optind;
extern int opterr;

int opterr = 1,     /* if error message should be printed */
    optind = 1,     /* index into parent argv vector */
    optopt,         /* character checked for validity */
    optreset;       /* reset getopt */
char *optarg;       /* argument associated with option */

#define BADCH   (int)'?'
#define BADARG  (int)':'
#define EMSG    ""

/*
 * getopt --
 *  Parse argc/argv argument vector.
 *
 * nargc/nargv are the argument count and vector; ostr lists the accepted
 * option letters, a letter followed by ':' takes an argument.
 *
 * Returns the option letter found (with optarg set when it takes an
 * argument), EOF when the options are exhausted, BADCH for an unknown
 * option or a missing argument, or BADARG for a missing argument when ostr
 * starts with ':'. Scanning state is kept between calls in a static
 * pointer; setting optreset to non-zero restarts scanning at optind.
 */
int
getopt(int nargc, char * const *nargv, const char *ostr)
{
    static char *place = EMSG;      /* option letter processing */
    char *oli;                      /* option letter list index */

    /* Start on the next argv word when reset was requested or the current
     * word has been consumed. */
    if (optreset || !*place) {
        /* update scanning pointer */
        optreset = 0;
        if (optind >= nargc || *(place = nargv[optind]) != '-') {
            place = EMSG;
            return (EOF);
        }
        if (place[1] && *++place == '-') {  /* found "--" */
            ++optind;
            place = EMSG;
            return (EOF);
        }
    }

    /* option letter okay? */
    if ((optopt = (int)*place++) == (int)':' ||
        !(oli = strchr(ostr, optopt))) {
        /*
         * if the user didn't specify '-' as an option,
         * assume it means EOF.
         */
        if (optopt == (int)'-')
            return (EOF);
        if (!*place)
            ++optind;
        if (opterr && *ostr != ':')
            (void)fprintf(stderr,
                "%s: illegal option -- %c\n", __FILE__, optopt);
        return (BADCH);
    }

    if (*++oli != ':') {            /* don't need argument */
        optarg = NULL;
        if (!*place)
            ++optind;
    } else {                        /* need an argument */
        if (*place)                 /* no white space */
            optarg = place;
        else if (nargc <= ++optind) {   /* no arg */
            place = EMSG;
            if (*ostr == ':')
                return (BADARG);
            if (opterr)
                (void)fprintf(stderr,
                    "%s: option requires an argument -- %c\n",
                    __FILE__, optopt);
            return (BADCH);
        } else                      /* white space */
            optarg = nargv[optind];
        place = EMSG;
        ++optind;
    }
    return (optopt);                /* dump back option letter */
}


#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>

/* On Windows getopt comes from the bundled implementation; elsewhere it is
 * declared by <unistd.h>. */
#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endif

/* Reads the whole input into memory, encodes it, then writes the result. */
static int memory_encode_file(FILE *in, FILE *out);
/* Reads the whole input into memory, decodes it, then writes the result. */
static int memory_decode_file(FILE *in, FILE *out);

/* Writes the program name, version and copyright line to `out`. */
static void
version(FILE *out)
{
    fputs("huffcode 0.3\n"
          "Copyright (C) 2003 Douglas Ryan Richardson"
          "; Gauss Interprise, Inc\n",
          out);
}

/*
 * Writes the command line help to `out`.
 *
 * The option lines are adjacent string literals forming ONE string: a comma
 * between them would pass extra arguments to fputs, which takes only a
 * string and a stream.
 */
static void
usage(FILE* out)
{
    fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c]\n"
          "-i - input file (default is standard input)\n"
          "-o - output file (default is standard output)\n"
          "-d - decompress\n"
          "-c - compress (default)\n"
          "-m - read file into memory, compress, then write to file (not default)\n"
          /* step1: by yzhang, for huffman statistics */
          "-t - output huffman statistics\n",
          out);
}
main(int argc, char** argv)
{char memory = 0;char compress = 1;int opt;const char *file_in = NULL, *file_out = NULL;//step1:add by yzhang for huffman statisticsconst char *file_out_table = NULL;//end by yzhangFILE *in = stdin;FILE *out = stdout;//step1:add by yzhang for huffman statisticsFILE * outTable = NULL;//end by yzhang/* Get the command line arguments. */while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1) //演示如何跳出循环,及查找括号对{switch(opt){case 'i':file_in = optarg;break;case 'o':file_out = optarg;break;case 'c':compress = 1;break;case 'd':compress = 0;break;case 'h':usage(stdout);return 0;case 'v':version(stdout);return 0;case 'm':memory = 1;break;// by yzhang for huffman statisticscase 't':file_out_table = optarg;            break;//end by yzhangdefault:usage(stderr);return 1;}}/* If an input file is given then open it. */if(file_in){in = fopen(file_in, "rb");if(!in){fprintf(stderr,"Can't open input file '%s': %s\n",file_in, strerror(errno));return 1;}}/* If an output file is given then create it. */if(file_out){out = fopen(file_out, "wb");if(!out){fprintf(stderr,"Can't open output file '%s': %s\n",file_out, strerror(errno));return 1;}}//by yzhang for huffman statisticsif(file_out_table){outTable = fopen(file_out_table, "w");if(!outTable){fprintf(stderr,"Can't open output file '%s': %s\n",file_out_table, strerror(errno));return 1;}}//end by yzhangif(memory){return compress ?memory_encode_file(in, out) : memory_decode_file(in, out);}if(compress)  //change by yzhanghuffman_encode_file(in, out,outTable);//step1:changed by yzhang from huffman_encode_file(in, out) to huffman_encode_file(in, out,outTable)elsehuffman_decode_file(in, out);if(in)fclose(in);if(out)fclose(out);if(outTable)fclose(outTable);return 0;
}static int
memory_encode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Encode the memory. */if(huffman_encode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;
}static int
memory_decode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Decode the memory. */if(huffman_decode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"

#ifdef WIN32
#include <winsock2.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#endif
/*
 * Structure 1: a node of the Huffman tree.
 *
 * A leaf carries a symbol; an inner node carries pointers to its two
 * children. Both alternatives share storage through the anonymous union,
 * so isLeaf must be consulted before either is read.
 */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;               /* non-zero when this node is a leaf */
    unsigned long count;                /* occurrence count of this node */
    struct huffman_node_tag *parent;    /* parent node, NULL for the root */

    union
    {
        struct
        {
            /* inner node: children reached by a 0 bit and by a 1 bit */
            struct huffman_node_tag *zero, *one;
        };
        unsigned char symbol;           /* leaf: the symbol it stands for */
    };
} huffman_node;
/*
 * Structure 2: the code word of one symbol, filled in once the tree has
 * been built.
 */
typedef struct huffman_code_tag
{
    /* The length of this code in bits. */
    unsigned long numbits;

    /* The bits that make up this code. The first
       bit is at position 0 in bits[0]. The second
       bit is at position 1 in bits[0]. The eighth
       bit is at position 7 in bits[0]. The ninth
       bit is at position 0 in bits[1]. */
    unsigned char *bits;
} huffman_code;
/*
 * Structure 3: per-symbol statistics, one slot for each of the 256 possible
 * byte values (added by yzhang for huffman statistics).
 *
 * All members are fixed-size arrays, so a huffman_stat needs no dynamic
 * initialisation; the former heap-based init_huffstatistics() was dead,
 * commented-out code and has been removed.
 */
typedef struct huffman_statistics_result
{
    float freq[256];                /* relative frequency of each symbol */
    unsigned long numbits[256];     /* code length of each symbol, in bits */
    unsigned char bits[256][100];   /* packed code bits of each symbol */
} huffman_stat;
/* end by yzhang */
/* Returns the number of bytes needed to hold numbits bits (rounded up). */
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
    return numbits / 8 + (numbits % 8 ? 1 : 0);
}

/*
 * get_bit returns the ith bit in the bits array
 * in the 0th position of the return value.
 *
 * The byte holding the bit is selected first, then shifted right and
 * masked with 1.
 */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    return (bits[i / 8] >> i % 8) & 1;
}

/*
 * Reverses the first numbits bits of the bits array in place, so that bit
 * i and bit numbits-1-i trade places.
 *
 * Codes are collected while walking from a leaf up to the root, but must be
 * emitted from the root down, hence the reversal. Unused high bits of the
 * last byte are cleared. The reversal swaps bits pairwise, so no scratch
 * buffer (previously obtained with alloca) is needed.
 */
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
    unsigned long lo, hi;

    if(numbits == 0)
        return;

    for(lo = 0, hi = numbits - 1; lo < hi; ++lo, --hi)
    {
        /* Differing bits are swapped by flipping both of them. */
        if(get_bit(bits, lo) != get_bit(bits, hi))
        {
            bits[lo / 8] ^= (unsigned char)(1u << (lo % 8));
            bits[hi / 8] ^= (unsigned char)(1u << (hi % 8));
        }
    }

    /* Zero the padding bits above the code in the final byte. */
    if(numbits % 8)
        bits[numbits / 8] &= (unsigned char)((1u << (numbits % 8)) - 1);
}
static huffman_code* new_code(const huffman_node* leaf)/*新建节点的函数*/
{/* Build the huffman code by walking up to* the root node and then reversing the bits,* since the Huffman code is calculated by* walking down the tree. */unsigned long numbits = 0;unsigned char* bits = NULL;huffman_code *p;while(leaf && leaf->parent){huffman_node *parent = leaf->parent;unsigned char cur_bit = (unsigned char)(numbits % 8);unsigned long cur_byte = numbits / 8;/* If we need another byte to hold the code,then allocate it. */if(cur_bit == 0){size_t newSize = cur_byte + 1;bits = (char*)realloc(bits, newSize);bits[newSize - 1] = 0; /* Initialize the new byte. */}/* If a one must be added then or it in. If a zero* must be added then do nothing, since the byte* was initialized to zero. */if(leaf == parent->one)bits[cur_byte] |= 1 << cur_bit;++numbits;leaf = parent;}if(bits)reverse_bits(bits, numbits);p = (huffman_code*)malloc(sizeof(huffman_code));p->numbits = numbits;p->bits = bits;return p;
}#define MAX_SYMBOLS 256
/* Table of tree nodes, one slot per possible symbol value. */
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
/* Table of code words, indexed by symbol value. */
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];

/*
 * Creates a leaf node for `symbol` with a count of 0 and no parent.
 * Returns NULL if the allocation fails.
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));
    if(!p)
        return NULL;
    p->isLeaf = 1;
    p->symbol = symbol;
    p->count = 0;
    p->parent = 0;
    return p;
}
/*
 * Creates an inner (non-leaf) node with the given count and children and
 * no parent. Returns NULL if the allocation fails.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));
    if(!p)
        return NULL;
    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = 0;
    return p;
}
/* Recursively frees `subtree` and every node below it; NULL is accepted. */
static void
free_huffman_tree(huffman_node *subtree)
{
    if(subtree == NULL)
        return;

    if(!subtree->isLeaf)
    {
        free_huffman_tree(subtree->zero);
        free_huffman_tree(subtree->one);
    }

    free(subtree);
}

/* Frees one code word together with its bit buffer. */
static void
free_code(huffman_code* p)
{
    free(p->bits);
    free(p);
}

/* Frees every code word stored in *pSE and then the table itself. */
static void
free_encoder(SymbolEncoder *pSE)
{
    unsigned long i;
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*pSE)[i];
        if(p)
            free_code(p);
    }

    free(pSE);
}

/*
 * Sets every slot of *pSF to NULL. Leaf nodes are not created here; the
 * counting routines create one the first time a symbol is seen.
 */
static void
init_frequencies(SymbolFrequencies *pSF)
{
    memset(*pSF, 0, sizeof(SymbolFrequencies));
}

/*
 * Small write-behind cache in front of a growable output buffer.
 * Bytes collect in `cache` and are appended to *pbufout when flushed.
 */
typedef struct buf_cache_tag
{
    unsigned char *cache;       /* staging area */
    unsigned int cache_len;     /* capacity of cache in bytes */
    unsigned int cache_cur;     /* number of bytes currently staged */
    unsigned char **pbufout;    /* caller's output buffer pointer */
    unsigned int *pbufoutlen;   /* caller's output length */
} buf_cache;

/*
 * Prepares *pc with a staging area of cache_size bytes and resets the
 * caller's output buffer to empty (*pbufout = NULL, *pbufoutlen = 0).
 * Returns 0 on success, 1 on bad arguments or allocation failure.
 */
static int init_cache(buf_cache* pc,
                      unsigned int cache_size,
                      unsigned char **pbufout,
                      unsigned int *pbufoutlen)
{
    assert(pc && pbufout && pbufoutlen);
    if(!pbufout || !pbufoutlen)
        return 1;

    pc->cache = (unsigned char*)malloc(cache_size);
    pc->cache_len = cache_size;
    pc->cache_cur = 0;
    pc->pbufout = pbufout;
    *pbufout = NULL;
    pc->pbufoutlen = pbufoutlen;
    *pbufoutlen = 0;

    return pc->cache ? 0 : 1;
}

/* Releases the staging area; the caller's output buffer is left alone. */
static void free_cache(buf_cache* pc)
{
    assert(pc);
    if(pc->cache)
    {
        free(pc->cache);
        pc->cache = NULL;
    }
}

/*
 * Appends the staged bytes to the output buffer and empties the staging
 * area. Returns 0 on success, 1 if the output buffer could not be grown
 * (the staged bytes are then kept).
 */
static int flush_cache(buf_cache* pc)
{
    assert(pc);

    if(pc->cache_cur > 0)
    {
        unsigned int newlen = pc->cache_cur + *pc->pbufoutlen;
        unsigned char* tmp = realloc(*pc->pbufout, newlen);
        if(!tmp)
            return 1;
        memcpy(tmp + *pc->pbufoutlen, pc->cache, pc->cache_cur);

        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
        pc->cache_cur = 0;
    }

    return 0;
}

/*
 * Queues to_write_len bytes from to_write for output.
 * Returns 0 on success, 1 on allocation failure.
 */
static int write_cache(buf_cache* pc,
                       const void *to_write,
                       unsigned int to_write_len)
{
    unsigned char* tmp;

    assert(pc && to_write);
    assert(pc->cache_len >= pc->cache_cur);

    /* If trying to write more than the cache will hold
     * flush the cache and allocate enough space immediately,
     * that is, don't use the cache. */
    if(to_write_len > pc->cache_len - pc->cache_cur)
    {
        unsigned int newlen;

        /* A failed flush must stop here: appending now would put the new
         * bytes ahead of the ones still staged. */
        if(flush_cache(pc))
            return 1;

        newlen = *pc->pbufoutlen + to_write_len;
        tmp = realloc(*pc->pbufout, newlen);
        if(!tmp)
            return 1;
        memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);
        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
    }
    else
    {
        /* Write the data to the cache. */
        memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);
        pc->cache_cur += to_write_len;
    }

    return 0;
}
/*
 * Counts how often each byte value occurs in the stream `in`.
 *
 * *pSF is cleared first; a leaf node is created for a symbol the first
 * time it is met, so symbols that never occur keep a NULL slot.
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    int c;
    unsigned int total_count = 0;   /* total number of symbols seen */

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input file. */
    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = c;       /* the symbol just read */

        /* First occurrence of this symbol: create its leaf. */
        if(!(*pSF)[uc])
            (*pSF)[uc] = new_leaf_node(uc);
        ++(*pSF)[uc]->count;
        ++total_count;
    }

    return total_count;
}

/*
 * Same as get_symbol_frequencies, but counts the bufinlen bytes at bufin.
 * Returns the total number of bytes counted.
 */
static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
                                   const unsigned char *bufin,
                                   unsigned int bufinlen)
{
    unsigned int i;
    unsigned int total_count = 0;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input buffer. */
    for(i = 0; i < bufinlen; ++i)
    {
        unsigned char uc = bufin[i];
        if(!(*pSF)[uc])
            (*pSF)[uc] = new_leaf_node(uc);
        ++(*pSF)[uc]->count;
        ++total_count;
    }

    return total_count;
}
static int
SFComp(const void *p1, const void *p2)
{const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;/* 所有为空的结构体(文件中从未出现的符号对应的结构体)排在最后 */if(hn1 == NULL && hn2 == NULL)return 0;if(hn1 == NULL)return 1;if(hn2 == NULL)return -1;if(hn1->count > hn2->count)return 1;else if(hn1->count < hn2->count)return -1;return 0;
}#if 1
static void
print_freqs(SymbolFrequencies * pSF)
{size_t i;for(i = 0; i < MAX_SYMBOLS; ++i){if((*pSF)[i])printf("%d, %ld\n", (*pSF)[i]->symbol, (*pSF)[i]->count);elseprintf("NULL\n");}
#endif/** build_symbol_encoder builds a SymbolEncoder by walking* down to the leaves of the Huffman tree and then,* for each leaf, determines its code.*/
/*
 * build_symbol_encoder builds a SymbolEncoder by walking
 * down to the leaves of the Huffman tree and then,
 * for each leaf, determines its code.
 *
 * The walk is depth first; a branch ends at its leaf, whose code is stored
 * in *pSF at the index of the leaf's symbol.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(subtree == NULL)
        return;

    if(subtree->isLeaf)
        (*pSF)[subtree->symbol] = new_code(subtree);
    else
    {
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
    }
}
/*
 * calculate_huffman_codes turns pSF into an array
 * with a single entry that is the root of the
 * huffman tree. The return value is a SymbolEncoder,
 * which is an array of huffman codes index by symbol value.
 *
 * With no symbols at all (empty input) the tree is empty, (*pSF)[0] stays
 * NULL and the returned table holds no codes.
 *
 * NOTE(review): the "#if 1" sections print the table to standard output for
 * demonstration; that mixes with the encoded data when the output stream is
 * stdout as well.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 1
    printf("BEFORE SORT\n");
    print_freqs(pSF);   /* show the symbols and counts before sorting */
#endif

    /* Sort the symbol frequency array by ascending frequency. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1
    printf("AFTER SORT\n");
    print_freqs(pSF);   /* show the symbols and counts after sorting */
#endif

    /* Get the number of symbols. Unused symbols were sorted to the end,
     * so the first NULL slot marks the count. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * n symbols need n - 1 merges. The bound is written i + 1 < n because
     * n is unsigned: n - 1 would wrap around when n is 0.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2; both get the new node as
         * their parent. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree, starting at the root. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}
/*
 * Write the huffman code table. The format is:
 * 4 byte code count in network byte order.
 * 4 byte number of bytes encoded
 *   (if you decode the data, you should get this number of bytes)
 * code1
 * ...
 * codeN, where N is the count read at the begginning of the file.
 * Each codeI has the following format:
 * 1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits code bytes.
 * The last byte of each code may have extra bits, if the number of
 * bits in the code is not a multiple of 8.
 *
 * Returns 0 on success, 1 on a write error.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int count_net, symbols_net;

    /* Determine the number of entries in se, i.e. how many distinct
     * symbols occur in the input. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. The value is
     * stored as an unsigned int, the same type read_code_table reads;
     * writing an unsigned long would emit sizeof(long) bytes instead. */
    count_net = htonl(count);
    if(fwrite(&count_net, sizeof(count_net), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbols_net = htonl(symbol_count);
    if(fwrite(&symbols_net, sizeof(symbols_net), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;
            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}
/*
 * Writes the code table through the cache `pc` into the output memory.
 * The layout matches write_code_table: entry count, encoded byte count,
 * then one (symbol, bit length, code bytes) record per symbol.
 *
 * Returns 0 on success, 1 if the cache could not store the data.
 */
static int
write_code_table_to_memory(buf_cache *pc,
                           SymbolEncoder *se,
                           unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int count_net, symbols_net;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. It is stored as
     * an unsigned int, the type read_code_table_from_memory reads back;
     * an unsigned long would occupy sizeof(long) bytes instead. */
    count_net = htonl(count);
    if(write_cache(pc, &count_net, sizeof(count_net)))
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbols_net = htonl(symbol_count);
    if(write_cache(pc, &symbols_net, sizeof(symbols_net)))
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            /* The value of i is < MAX_SYMBOLS (256), so it can
               be stored in an unsigned char. */
            unsigned char uc = (unsigned char)i;
            /* Write the 1 byte symbol. */
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;
            /* Write the 1 byte code bit length. */
            uc = (unsigned char)p->numbits;
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(write_cache(pc, p->bits, numbytes))
                return 1;
        }
    }

    return 0;
}
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.(it is stored in network byte order). */if(fread(&count, sizeof(count), 1, in) != 1){free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0){int c;unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if((c = fgetc(in)) == EOF){free_huffman_tree(root);return NULL;}symbol = (unsigned char)c;if((c = fgetc(in)) == EOF){free_huffman_tree(root);return NULL;}numbits = (unsigned char)c;numbytes = (unsigned char)numbytes_from_numbits(numbits);bytes = (unsigned char*)malloc(numbytes);if(fread(bytes, 1, numbytes, in) != numbytes){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}static int
memread(const unsigned char* buf,unsigned int buflen,unsigned int *pindex,void* bufout,unsigned int readlen)
{assert(buf && pindex && bufout);assert(buflen >= *pindex);if(buflen < *pindex)return 1;if(readlen + *pindex >= buflen)return 1;memcpy(bufout, buf + *pindex, readlen);*pindex += readlen;return 0;
}static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,unsigned int bufinlen,unsigned int *pindex,unsigned int *pDataBytes)
{huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.(it is stored in network byte order). */if(memread(bufin, bufinlen, pindex, &count, sizeof(count))){free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes))){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0){unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol))){free_huffman_tree(root);return NULL;}if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits))){free_huffman_tree(root);return NULL;}numbytes = (unsigned char)numbytes_from_numbits(numbits);bytes = (unsigned char*)malloc(numbytes);if(memread(bufin, bufinlen, pindex, bytes, numbytes)){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{unsigned char curbyte = 0;unsigned char curbit = 0;int c;while((c = fgetc(in)) != EOF){unsigned char uc = (unsigned char)c;//当前扫描得到的符号huffman_code *code = (*se)[uc];//当前扫描符号对应的结构体指针unsigned long i;//将结构体中的码字写到输出文件中for(i = 0; i < code->numbits; ++i){/* Add the current bit to curbyte. */curbyte |= get_bit(code->bits, i) << curbit;/* If this byte is filled up then write it* out and reset the curbit and curbyte. */if(++curbit == 8){fputc(curbyte, out);curbyte = 0;curbit = 0;}}}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/if(curbit > 0)fputc(curbyte, out);return 0;
}static int
do_memory_encode(buf_cache *pc,const unsigned char* bufin,unsigned int bufinlen,SymbolEncoder *se)
{unsigned char curbyte = 0;unsigned char curbit = 0;unsigned int i;for(i = 0; i < bufinlen; ++i){unsigned char uc = bufin[i];huffman_code *code = (*se)[uc];unsigned long i;for(i = 0; i < code->numbits; ++i){/* Add the current bit to curbyte. */curbyte |= get_bit(code->bits, i) << curbit;/* If this byte is filled up then write it* out and reset the curbit and curbyte. */if(++curbit == 8){if(write_cache(pc, &curbyte, sizeof(curbyte)))return 1;curbyte = 0;curbit = 0;}}}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;
}//step3:add by yzhang for huffman statistics
/* step3: add by yzhang for huffman statistics */

/*
 * Fills st->freq with the relative frequency (count / total_count) of each
 * symbol; symbols with no node get 0. *SF must still be indexed by symbol
 * value. Returns 1 when the counts add up to total_count, 0 otherwise.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int i,count =0;
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*SF)[i])
        {
            st->freq[i]=(float)(*SF)[i]->count/total_count;
            count+=(*SF)[i]->count;
        }
        else
        {
            st->freq[i]= 0;
        }
    }
    if(count==total_count)
        return 1;
    else
        return 0;
}

/*
 * Copies the length and the packed bits of every code in *se into st;
 * symbols without a code get a length of 0. Always returns 0.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long i,j;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            st->numbits[i] = p->numbits;
            numbytes = numbytes_from_numbits(p->numbits);
            for (j=0;j<numbytes;j++)
                st->bits[i][j] = p->bits[j];
        }
        else
            st->numbits[i] =0;
    }

    return 0;
}

/*
 * Writes one tab separated line per symbol to out_Table: symbol value,
 * frequency, code length and the code as a string of 0/1 digits.
 * Nothing is written when out_Table is NULL (no statistics file given).
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int i;
    unsigned long j;
    unsigned char c;

    if(!out_Table)
        return;

    fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        fprintf(out_Table,"%d\t   ",i);
        fprintf(out_Table,"%f\t   ",st->freq[i]);
        /* numbits is unsigned long, so it is printed with %lu */
        fprintf(out_Table,"%lu\t    ",st->numbits[i]);
        if(st->numbits[i])
        {
            for(j = 0; j < st->numbits[i]; ++j)
            {
                c =get_bit(st->bits[i], j);
                fprintf(out_Table,"%d",c);
            }
        }
        fprintf(out_Table,"\n");
    }
}
/* end by yzhang */
/** huffman_encode_file huffman encodes in to out.*/
huffman_encode_file(FILE *in, FILE *out, FILE *out_Table)  //step1:changed by yzhang for huffman statistics from (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;//step2:add by yzhang for huffman statisticshuffman_stat hs;//文件统计信息//end by yzhang/* 第一次扫描:统计输入文件每个符号(0~255)的频率(0~1)。*//* Get the frequency of each symbol in the input file. */symbol_count = get_symbol_frequencies(&sf, in); //演示扫描完一遍文件后,SF指针数组的每个元素的构成//step3:add by yzhang for huffman statistics,...  get the frequency of each symbol huffST_getSymFrequencies(&sf,&hs,symbol_count);//end by yzhang/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);root = sf[0];//step3:add by yzhang for huffman statistics... output the statistics to filehuffST_getcodeword(se, &hs);output_huffman_statistics(&hs,out_Table);//end by yzhang/* Scan the file again and, using the tablepreviously built, encode it into the output file. */rewind(in);rc = write_code_table(out, se, symbol_count);if(rc == 0)rc = do_file_encode(in, out, se);/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);return rc;
huffman_decode_file(FILE *in, FILE *out)
{huffman_node *root, *p;int c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);if(!root)return 1;/* Decode the file. */p = root;while(data_count > 0 && (c = fgetc(in)) != EOF){unsigned char byte = (unsigned char)c;unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){fputc(p->symbol, out);p = root;--data_count;}}}free_huffman_tree(root);return 0;
}#define CACHE_SIZE 1024int huffman_encode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;buf_cache cache;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen))return 1;/* Get the frequency of each symbol in the input memory. */symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);root = sf[0];/* Scan the memory again and, using the tablepreviously built, encode it into the output memory. */rc = write_code_table_to_memory(&cache, se, symbol_count);if(rc == 0)rc = do_memory_encode(&cache, bufin, bufinlen, se);/* Flush the cache. */flush_cache(&cache);/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);free_cache(&cache);return rc;
}int huffman_decode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{huffman_node *root, *p;unsigned int data_count;unsigned int i = 0;unsigned char *buf;unsigned int bufcur = 0;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;/* Read the Huffman code table. */root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);if(!root)return 1;buf = (unsigned char*)malloc(data_count);/* Decode the memory. */p = root;for(; i < bufinlen && data_count > 0; ++i) {unsigned char byte = bufin[i];unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){buf[bufcur++] = p->symbol;p = root;--data_count;}}}free_huffman_tree(root);*pbufout = buf;*pbufoutlen = bufcur;return 0;



实验结果的分析:



  1. 数据压缩 实验三 Huffman编解码算法实现与压缩效率分析

    实验目的 掌握Huffman编解码实现的数据结构和实现框架, 进一步熟练使用C编程语言, 并完成压缩效率的分析. 实验原理 1.本实验中Huffman编码算法 (1)将文件以ASCII字符流的形式读入 ...

  2. python Huffman编码及解码

    Huffman编码及解码 # coding:utf-8#Tree-Node Type class Node:def __init__(self,freq):self.left = Noneself.r ...

  3. 北理乐学计算机实验三,北理工大学计算机实验三-字符编码与信息交换.docx

    北理工大学计算机实验三-字符编码与信息交换.docx 实验报告实验名称学号 姓名 班级 实验时间 年 月 日实验报告表3-1 西文字符显示过程编码记录表输入字符ASCII码(十进制数)内存信息(二进制 ...

  4. Huffman编码与解码

    Huffman编码与解码 // @author: Folivora Li // @copyright Folivora Li/* 4.Huffman编码与解码 (必做)(Huffman编码.二叉树) ...

  5. 实验三—Huffman编解码

    一.实验原理 1.Huffman编码的步骤: (1)首先将所有字符发生的概率从小到大进行排序: (2)将最小的两个概率进行两两一合并,之后继续找最小的两个概率进行合并包括前面已经合并的和数: (3)一 ...

  6. 实验三 Huffman编解码算法实现与压缩效率分析

    一.Huffman编解码原理 1. Huffman编码 对原始文件进行Huffman编码,首先需要解决以下几点问题: 文件符号的概率分布情况是怎样的? Huffman树是如何建立的? 建立起Huffm ...

  7. 数据压缩原理 实验三 Huffman编解码算法实现与压缩效率分析

    实验原理 Huffman编码是一种无失真编码方式,是一种可变长编码,它将出现概率大的信源符号短编码,出现概率小的信源符号长编码. 编码步骤: ①将文件以ASCII字符流的形式读入,统计每个符号的发生概 ...

  8. 数据压缩实验三--Huffman编解码及压缩率的比较

    一,Huffman码 1 Huffman 编码 Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman编码是可变字长编码(VLC)的一种. Huffman 编码基于信源 ...

  9. DS二叉树——Huffman编码与解码(不含代码框架)

    题目描述 1.问题描述 给定n个字符及其对应的权值,构造Huffman树,并进行huffman编码和译(解)码. 构造Huffman树时,要求左子树根的权值小于.等于右子树根的权值. 进行Huffma ...


  1. AI一分钟 | 阿里云放大招要揽1000名AI人才,川普AI守国论遭遇54名科学家反对
  2. 同时给两个变量值赋值
  3. 用python处理excel数据的优势-python数据分析相对于bi和excel的优势是什么?
  4. Vue.js 状态过渡
  5. Java常考面试题(一)
  6. 数据表的查看 mysql
  7. 关于SQL Server 2005 的自动远程数据库备份
  8. python释放变量内存_看完2019年阿里巴巴Python面试题详解,月薪3万不是梦
  9. C# 通过PostMessage完成UI的更新
  10. JAVA实现显示指定类型的文件的例子
  11. 职工信息管理系统(c语言实现)
  12. Kali下安装 dvwa 的完整详细教程
  13. 电子面单打印模板规格汇总-快递鸟
  14. 遥感学习笔记(四)——遥感数据分类
  15. idea修改主题和更换背景
  16. 星巴克在东京开设四层楼的全沉浸式优质咖啡体验门店
  17. 台式计算机如何自动开关机,电脑怎么设置自动关机时间 电脑自动开机时间怎么设置...
  18. 最长等差数列 leetcode java_51nod1055 最长等差数列
  19. 一学就会的 WordPress 实战课
  20. deepin v20显卡问题wifi网速慢cpu高频率发热(2021-1-23更新)


  1. STM32个人笔记-CAN总线通讯
  2. 小论快充(原理、协议、比较)
  3. 电影垂直社交观影和亲友们在家一起看电影吧
  4. MAC终端输入换行问题
  5. 全球首辆飞行汽车将在欧洲上路行驶;全球十大电视制造商明年将购买2亿块液晶电视面板 | 美通企业日报...
  6. 《剑指Offer》题解汇总索引表(leetcode)
  7. 实用五步法教会你指标体系的设计与加工
  8. 凸优化学习-(二十九)有约束优化算法——增广拉格朗日法、交替方向乘子法(ADMM)
  9. java poi 模板填数据库,java使用POI读取excel模版并向固定表格里填写数据详解
  10. C#分割字符串。歌词