一、实验原理

Huffman编码实现的数据结构

Huffman编码为可变长编码,若各码字长度按照所对应符号出现概率的大小逆序排列,则其平均长度最小。

编码步骤:
1、将信源符号按照出现概率由大到小的顺序排列;
2、将两个最小概率组合相加,并继续这一步骤,始终将较高的概率分支放在上部,直到概率到达1为止;
3、对每对组合的上边一个指定为1,下边一个指定为0(或相反指定);
4、画出由概率1处到每个信源符号概率的路径,顺序记录下沿路径的1和0,所得即为该符号的Huffman码字。
在程序中具体实现上述方法的步骤如下:
1、读入待编码的文件(格式不限可能是文档、音频等);
2、扫描文件,统计各个字符出现的概率并建立相应的树叶节点;
3、建立Huffman树,
(1)按字符概率由小到大将对应结点排序
(2) 得到文件出现的字符种类数
(3)构建霍夫曼树:先置两个初始树叶节点,再构造这两个树叶节点的父节点(合并概率),再将一节点置空,重新排序。
(4)对码树编码:函数中对最后排好序的概率遍历,判断是否为树叶节点,若是则构造huffman码结构否则递归函数直到达到树叶节点。
4、将码表及其他必要信息写入输出文件
5、第二次扫描:对源文件进行编码并输出

二、部分代码及注释:

huffman.h

#ifndef HUFFMAN_HUFFMAN_H
#define HUFFMAN_HUFFMAN_H

#include <stdio.h>

/*
 * Huffman-encode the stream `in` into `out`; per-symbol statistics
 * (frequency, code length, code) are written to `out_Table`.
 * The third parameter was added for the statistics output (step 1, yzhang).
 */
int huffman_encode_file(FILE *in, FILE *out, FILE *out_Table);

/* Decode a stream produced by huffman_encode_file from `in` into `out`. */
int huffman_decode_file(FILE *in, FILE *out);

/*
 * Encode `bufinlen` bytes at `bufin`. The result is returned through
 * `*pbufout` / `*pbufoutlen`.
 */
int huffman_encode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen);

/*
 * Decode `bufinlen` bytes at `bufin`. The result is returned through
 * `*bufout` / `*pbufoutlen`.
 */
int huffman_decode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **bufout,
                          unsigned int *pbufoutlen);

#endif

getopt.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>/* declarations to provide consistent linkage */
extern char *optarg;
extern int optind;
extern int opterr;

int opterr = 1;     /* nonzero: error messages are printed */
int optind = 1;     /* index into the caller's argv vector */
int optopt;         /* option character currently being checked */
int optreset;       /* set nonzero to restart scanning */
char *optarg;       /* argument associated with the current option */

#define BADCH   (int)'?'
#define BADARG  (int)':'
#define EMSG    ""

/*
 * getopt --
 *  Parse the argc/argv argument vector.
 *
 * Returns the next option letter found in ostr, BADCH for an unknown
 * option or a missing argument (BADARG instead when ostr starts with ':'),
 * and EOF once the first non-option word or "--" is reached.
 */
int
getopt(int nargc, char * const *nargv, const char* ostr)
{
    static char *place = EMSG;  /* cursor inside the argv word being scanned */
    char *match;                /* position of optopt inside ostr */

    /* Start on a new argv word after a reset or when the previous one is used up. */
    if (optreset || *place == '\0') {
        optreset = 0;
        if (optind >= nargc) {
            place = EMSG;
            return (EOF);
        }
        place = nargv[optind];
        if (*place != '-') {
            place = EMSG;
            return (EOF);
        }
        if (place[1] != '\0') {
            ++place;
            if (*place == '-') {    /* found "--" */
                ++optind;
                place = EMSG;
                return (EOF);
            }
        }
    }

    /* Is the option letter valid? ':' itself is never a valid option. */
    optopt = (int)*place++;
    match = (optopt == (int)':') ? NULL : strchr(ostr, optopt);
    if (match == NULL) {
        /* If the user didn't specify '-' as an option, treat it as EOF. */
        if (optopt == (int)'-')
            return (EOF);
        if (*place == '\0')
            ++optind;
        if (opterr && *ostr != ':')
            (void)fprintf(stderr,
                "%s: illegal option -- %c\n", __FILE__, optopt);
        return (BADCH);
    }

    if (match[1] != ':') {
        /* Option takes no argument. */
        optarg = NULL;
        if (*place == '\0')
            ++optind;
        return (optopt);
    }

    /* Option needs an argument. */
    if (*place != '\0') {
        optarg = place;             /* attached: -ofile */
    } else if (nargc <= ++optind) {
        /* No argument left. */
        place = EMSG;
        if (*ostr == ':')
            return (BADARG);
        if (opterr)
            (void)fprintf(stderr,
                "%s: option requires an argument -- %c\n",
                __FILE__, optopt);
        return (BADCH);
    } else {
        optarg = nargv[optind];     /* separate word: -o file */
    }
    place = EMSG;
    ++optind;
    return (optopt);                /* hand back the option letter */
}

huffcode.c

#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endifstatic int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);static void
version(FILE *out)
{fputs("huffcode 0.3\n""Copyright (C) 2003 Douglas Ryan Richardson""; Gauss Interprise, Inc\n",out);
}static void
usage(FILE* out)
{fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c]\n""-i - input file (default is standard input)\n""-o - output file (default is standard output)\n""-d - decompress\n""-c - compress (default)\n""-m - read file into memory, compress, then write to file (not default)\n",// step1: by yzhang, for huffman statistics"-t - output huffman statistics\n",//step1:end by yzhangout);
}int
main(int argc, char** argv)
{char memory = 0;char compress = 1;int opt;const char *file_in = NULL, *file_out = NULL;//step1:add by yzhang for huffman statisticsconst char *file_out_table = NULL;//end by yzhangFILE *in = stdin;FILE *out = stdout;//step1:add by yzhang for huffman statisticsFILE * outTable = NULL;//end by yzhang/* Get the command line arguments. */while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1) //演示如何跳出循环,及查找括号对{switch(opt){case 'i':file_in = optarg;break;case 'o':file_out = optarg;break;case 'c':compress = 1;break;case 'd':compress = 0;break;case 'h':usage(stdout);return 0;case 'v':version(stdout);return 0;case 'm':memory = 1;break;// by yzhang for huffman statisticscase 't':file_out_table = optarg;            break;//end by yzhangdefault:usage(stderr);return 1;}}/* If an input file is given then open it. */if(file_in){in = fopen(file_in, "rb");if(!in){fprintf(stderr,"Can't open input file '%s': %s\n",file_in, strerror(errno));return 1;}}/* If an output file is given then create it. */if(file_out){out = fopen(file_out, "wb");if(!out){fprintf(stderr,"Can't open output file '%s': %s\n",file_out, strerror(errno));return 1;}}//by yzhang for huffman statisticsif(file_out_table){outTable = fopen(file_out_table, "w");if(!outTable){fprintf(stderr,"Can't open output file '%s': %s\n",file_out_table, strerror(errno));return 1;}}//end by yzhangif(memory){return compress ?memory_encode_file(in, out) : memory_decode_file(in, out);}if(compress)  //change by yzhanghuffman_encode_file(in, out,outTable);//step1:changed by yzhang from huffman_encode_file(in, out) to huffman_encode_file(in, out,outTable)elsehuffman_decode_file(in, out);if(in)fclose(in);if(out)fclose(out);if(outTable)fclose(outTable);return 0;
}static int
memory_encode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Encode the memory. */if(huffman_encode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;
}static int
memory_decode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Decode the memory. */if(huffman_decode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;
}

huffman.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"#ifdef WIN32
#include <winsock2.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#endif//结构体,看数据存储的结构:
/* Structure 1: a node of the Huffman tree. */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;               /* nonzero when this node is a leaf */
    unsigned long count;                /* occurrence count of this node */
    struct huffman_node_tag *parent;    /* parent node */

    /*
     * An internal node stores pointers to its two children here;
     * a leaf stores its symbol instead.
     */
    union
    {
        struct
        {
            struct huffman_node_tag *zero, *one;
        };
        unsigned char symbol;
    };
} huffman_node;

/* Structure 2: a code word, filled in once the tree has been built. */
typedef struct huffman_code_tag
{
    /* The length of this code in bits. */
    unsigned long numbits;

    /*
     * The bits that make up this code. The first bit is at position 0 in
     * bits[0], the second at position 1 in bits[0], the eighth at
     * position 7 in bits[0], the ninth at position 0 in bits[1].
     * Example from the original notes: the code 10010011 011100 (read in
     * leaf-to-root direction) is stored as bits[0]=11001001,
     * bits[1]=00001110.
     */
    unsigned char *bits;
} huffman_code;

/*
 * Structure 3: per-symbol statistics, one slot for each of the 256
 * possible symbols.
 */
typedef struct huffman_statistics_result
{
    float freq[256];                /* frequency of the symbol */
    unsigned long numbits[256];     /* length of the symbol's code word */
    unsigned char bits[256][100];   /* the symbol's code word */
} huffman_stat;

/*
huffman_stat *init_huffstatistics()
{
    huffman_stat *p;
    int i;
    p = (huffman_stat*)malloc(sizeof(huffman_stat));
    p->freq = (float *)malloc(sizeof(float)*256 );
    p->numbits = (unsigned long *) malloc(sizeof(unsigned long)*256);
    for (i=0 ; i<256;i++)
        p->bits[i] = (unsigned char *)malloc(sizeof(unsigned char)*100);
    return p;
}
*/
/* end by yzhang */

/* Functions: the operations the program performs on the data above. */
/* Number of bytes needed to store a code of numbits bits (rounded up). */
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
    unsigned long whole = numbits / 8;

    if (numbits % 8 != 0)
        ++whole;    /* a partial byte still occupies a full byte */
    return whole;
}

/* Get bit i of a code word; the value is returned in the lowest bit of the result. */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{return (bits[i / 8] >> i % 8) & 1;
}//先确定码字的第i位在第几个字节,再将该字节右移后与0000 0001按位与,得到第i位static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{unsigned long numbytes = numbytes_from_numbits(numbits);unsigned char *tmp =(unsigned char*)alloca(numbytes);//alloca函数是在栈(stack)上申请内存,用完立即释放unsigned long curbit;long curbyte = 0;memset(tmp, 0, numbytes);for(curbit = 0; curbit < numbits; ++curbit){unsigned int bitpos = curbit % 8;if(curbit > 0 && curbit % 8 == 0)++curbyte;tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);}memcpy(bits, tmp, numbytes);
}//作用是将码字倒序,因为bits是生成码树时从下到上的,而码字需要从上到下读。/** new_code builds a huffman_code from a leaf in a Huffman tree.*/
static huffman_code* new_code(const huffman_node* leaf)/*新建节点的函数*/
{/* Build the huffman code by walking up to* the root node and then reversing the bits,* since the Huffman code is calculated by* walking down the tree. */unsigned long numbits = 0;unsigned char* bits = NULL;huffman_code *p;while(leaf && leaf->parent){huffman_node *parent = leaf->parent;unsigned char cur_bit = (unsigned char)(numbits % 8);unsigned long cur_byte = numbits / 8;/* If we need another byte to hold the code,then allocate it. */if(cur_bit == 0){size_t newSize = cur_byte + 1;bits = (char*)realloc(bits, newSize);bits[newSize - 1] = 0; /* Initialize the new byte. */}/* If a one must be added then or it in. If a zero* must be added then do nothing, since the byte* was initialized to zero. */if(leaf == parent->one)bits[cur_byte] |= 1 << cur_bit;++numbits;leaf = parent;}if(bits)reverse_bits(bits, numbits);p = (huffman_code*)malloc(sizeof(huffman_code));p->numbits = numbits;p->bits = bits;return p;
}#define MAX_SYMBOLS 256
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];static huffman_node*
new_leaf_node(unsigned char symbol)
{huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));p->isLeaf = 1;p->symbol = symbol;p->count = 0;p->parent = 0;return p;
}//生成一个树叶节点static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));p->isLeaf = 0;p->count = count;p->zero = zero;p->one = one;p->parent = 0;return p;
}//生成一个非树叶节点static void
free_huffman_tree(huffman_node *subtree)
{if(subtree == NULL)return;if(!subtree->isLeaf){free_huffman_tree(subtree->zero);free_huffman_tree(subtree->one);}free(subtree);
}static void
free_code(huffman_code* p)
{free(p->bits);free(p);
}static void
free_encoder(SymbolEncoder *pSE)
{unsigned long i;for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*pSE)[i];if(p)free_code(p);}free(pSE);
}static void
init_frequencies(SymbolFrequencies *pSF)
{memset(*pSF, 0, sizeof(SymbolFrequencies));//将所有指针的地址初始化为0(NULL)
#if 0//#if 0~~#endif中的代码是被屏蔽掉的unsigned int i;for(i = 0; i < MAX_SYMBOLS; ++i){unsigned char uc = (unsigned char)i;(*pSF)[i] = new_leaf_node(uc);}
#endif
}typedef struct buf_cache_tag
{unsigned char *cache;unsigned int cache_len;unsigned int cache_cur;unsigned char **pbufout;unsigned int *pbufoutlen;
} buf_cache;static int init_cache(buf_cache* pc,unsigned int cache_size,unsigned char **pbufout,unsigned int *pbufoutlen)
{assert(pc && pbufout && pbufoutlen);if(!pbufout || !pbufoutlen)return 1;pc->cache = (unsigned char*)malloc(cache_size);pc->cache_len = cache_size;pc->cache_cur = 0;pc->pbufout = pbufout;*pbufout = NULL;pc->pbufoutlen = pbufoutlen;*pbufoutlen = 0;return pc->cache ? 0 : 1;
}static void free_cache(buf_cache* pc)
{assert(pc);if(pc->cache){free(pc->cache);pc->cache = NULL;}
}static int flush_cache(buf_cache* pc)
{assert(pc);if(pc->cache_cur > 0){unsigned int newlen = pc->cache_cur + *pc->pbufoutlen;unsigned char* tmp = realloc(*pc->pbufout, newlen);if(!tmp)return 1;memcpy(tmp + *pc->pbufoutlen, pc->cache, pc->cache_cur);*pc->pbufout = tmp;*pc->pbufoutlen = newlen;pc->cache_cur = 0;}return 0;
}static int write_cache(buf_cache* pc,const void *to_write,unsigned int to_write_len)
{unsigned char* tmp;assert(pc && to_write);assert(pc->cache_len >= pc->cache_cur);/* If trying to write more than the cache will hold* flush the cache and allocate enough space immediately,* that is, don't use the cache. */if(to_write_len > pc->cache_len - pc->cache_cur){unsigned int newlen;flush_cache(pc);newlen = *pc->pbufoutlen + to_write_len;tmp = realloc(*pc->pbufout, newlen);if(!tmp)return 1;memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);*pc->pbufout = tmp;*pc->pbufoutlen = newlen;}else{/* Write the data to the cache. */memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);pc->cache_cur += to_write_len;}return 0;
}//第一步:第一次扫描文件,建立256个树叶节点,计算每个符号对应的频数
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int total = 0;     /* number of symbols read so far */
    int ch;

    /* Start with every table slot set to NULL. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input file. */
    for(ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        unsigned char sym = (unsigned char)ch;
        huffman_node **slot = &(*pSF)[sym];

        /* First occurrence of this symbol: create its leaf. */
        if(*slot == NULL)
            *slot = new_leaf_node(sym);
        (*slot)->count += 1;
        total += 1;
    }

    /* Total number of symbols in the file. */
    return total;
}

/* Same as get_symbol_frequencies, but counts the bufinlen bytes at bufin. */
static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
                                   const unsigned char *bufin,
                                   unsigned int bufinlen)
{
    unsigned int total = 0;
    unsigned int pos = 0;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input buffer. */
    while(pos < bufinlen)
    {
        unsigned char sym = bufin[pos++];
        huffman_node **slot = &(*pSF)[sym];

        if(*slot == NULL)
            *slot = new_leaf_node(sym);
        (*slot)->count += 1;
        total += 1;
    }

    return total;
}

/*
 * When used by qsort, SFComp sorts the array so that
 * the symbol with the lowest frequency is first. Any
 * NULL entries will be sorted to the end of the list.
 */
static int
SFComp(const void *p1, const void *p2)
{const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;/* 所有为空的结构体(文件中从未出现的符号对应的结构体)排在最后 */if(hn1 == NULL && hn2 == NULL)return 0;if(hn1 == NULL)return 1;if(hn2 == NULL)return -1;if(hn1->count > hn2->count)return 1;else if(hn1->count < hn2->count)return -1;return 0;
}#if 1
static void
print_freqs(SymbolFrequencies * pSF)
{size_t i;for(i = 0; i < MAX_SYMBOLS; ++i){if((*pSF)[i])printf("%d, %ld\n", (*pSF)[i]->symbol, (*pSF)[i]->count);elseprintf("NULL\n");}
}//将符号及其对应的频率显示出来
#endif/** build_symbol_encoder builds a SymbolEncoder by walking* down to the leaves of the Huffman tree and then,* for each leaf, determines its code.*/
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(subtree == NULL)
        return;

    /* Internal node: descend into both children (depth-first). */
    if(!subtree->isLeaf)
    {
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* Leaf: this branch ends here, so record the symbol's code. */
    (*pSF)[subtree->symbol] = new_code(subtree);
}

/*
 * calculate_huffman_codes turns pSF into an array
 * with a single entry that is the root of the
 * huffman tree. The return value is a SymbolEncoder,
 * which is an array of huffman codes index by symbol value.
 */
/* Step 2: build the Huffman tree.
 * Returns NULL if memory cannot be allocated. */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

    /* NOTE(review): the debug dumps below go to standard output, which
     * huffcode's main also uses as its default output stream - confirm
     * this is intended. */
#if 1
    printf("BEFORE SORT\n");
    print_freqs(pSF);   /* show the symbols and counts before sorting */
#endif

    /* qsort(base, nelem, width, fcmp): sort the table by ascending count,
     * so the least frequent symbols come first. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1
    printf("AFTER SORT\n");
    print_freqs(pSF);   /* show the symbols and counts after sorting */
#endif

    /* After sorting, unused symbols (NULL) are at the end, so this loop
     * yields the number of distinct symbols in the input. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * n symbols need n - 1 merges. The test is written as i + 1 < n
     * because n is unsigned: n - 1 would wrap for an empty input.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        huffman_node *merged;

        /* Take the two nodes with the smallest counts. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Merge them under a new node whose count is the sum of both;
         * m1 becomes the zero child and m2 the one child. */
        merged = new_nonleaf_node(m1->count + m2->count, m1, m2);
        if(merged == NULL)
            return NULL;
        (*pSF)[0] = m1->parent = m2->parent = merged;
        (*pSF)[1] = NULL;

        /* Re-sort after every merge. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* A single distinct symbol would leave a bare leaf as the root and
     * give it a zero-length code, from which nothing can be decoded.
     * Hang the leaf under a parent so it receives the 1-bit code "0". */
    if(n == 1 && (*pSF)[0]->isLeaf)
    {
        huffman_node *only = (*pSF)[0];
        huffman_node *parent = new_nonleaf_node(only->count, only, NULL);
        if(parent == NULL)
            return NULL;
        only->parent = parent;
        (*pSF)[0] = parent;
    }

    /* Walk the finished tree from the root to get each symbol's code. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(pSE == NULL)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}

/*
 * Write the huffman code table. The format is:
 * 4 byte code count in network byte order.
 * 4 byte number of bytes encoded
 *   (if you decode the data, you should get this number of bytes)
 * code1
 * ...
 * codeN, where N is the count read at the begginning of the file.
 * Each codeI has the following format:
 * 1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits code bytes.
 * The last byte of each code may have extra bits, if the number of
 * bits in the code is not a multiple of 8.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int be_count;

    /* Count the symbols that have a code, i.e. the number of distinct
     * symbols in the input. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network (big-endian) byte order.
     * The value is held in an unsigned int, the same type
     * read_code_table reads back; writing it from an unsigned long
     * would emit sizeof(unsigned long) bytes and shift the rest of the
     * header wherever long is wider than int. */
    be_count = htonl(count);
    if(fwrite(&be_count, sizeof(be_count), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* 1 byte: the symbol (i < MAX_SYMBOLS, so it fits). */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;
            /* 1 byte: the code length in bits. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;
            /* The code bytes. */
            numbytes = (unsigned int)numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}

/*
 * Allocates memory and sets *pbufout to point to it. The memory
 * contains the code table.
 */
static int
write_code_table_to_memory(buf_cache *pc,
                           SymbolEncoder *se,
                           unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int be_count;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. An unsigned int
     * is used so that the field has the same size that
     * read_code_table_from_memory reads back; an unsigned long can be
     * wider and would shift the rest of the header. */
    be_count = htonl(count);
    if(write_cache(pc, &be_count, sizeof(be_count)))
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(write_cache(pc, &symbol_count, sizeof(symbol_count)))
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            /* The value of i is < MAX_SYMBOLS (256), so it can
               be stored in an unsigned char. */
            unsigned char uc = (unsigned char)i;

            /* Write the 1 byte symbol. */
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;

            /* Write the 1 byte code bit length. */
            uc = (unsigned char)p->numbits;
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;

            /* Write the code bytes (a zero-length code has none). */
            numbytes = (unsigned int)numbytes_from_numbits(p->numbits);
            if(numbytes > 0 && write_cache(pc, p->bits, numbytes))
                return 1;
        }
    }

    return 0;
}

/*
 * read_code_table builds a Huffman tree from the code
 * in the in file. This function returns NULL on error.
 * The returned value should be freed with free_huffman_tree.
 */
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.(it is stored in network byte order). */if(fread(&count, sizeof(count), 1, in) != 1){free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0){int c;unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if((c = fgetc(in)) == EOF){free_huffman_tree(root);return NULL;}symbol = (unsigned char)c;if((c = fgetc(in)) == EOF){free_huffman_tree(root);return NULL;}numbits = (unsigned char)c;numbytes = (unsigned char)numbytes_from_numbits(numbits);bytes = (unsigned char*)malloc(numbytes);if(fread(bytes, 1, numbytes, in) != numbytes){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}static int
memread(const unsigned char* buf,unsigned int buflen,unsigned int *pindex,void* bufout,unsigned int readlen)
{assert(buf && pindex && bufout);assert(buflen >= *pindex);if(buflen < *pindex)return 1;if(readlen + *pindex >= buflen)return 1;memcpy(bufout, buf + *pindex, readlen);*pindex += readlen;return 0;
}static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,unsigned int bufinlen,unsigned int *pindex,unsigned int *pDataBytes)
{huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.(it is stored in network byte order). */if(memread(bufin, bufinlen, pindex, &count, sizeof(count))){free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes))){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0){unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol))){free_huffman_tree(root);return NULL;}if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits))){free_huffman_tree(root);return NULL;}numbytes = (unsigned char)numbytes_from_numbits(numbits);bytes = (unsigned char*)malloc(numbytes);if(memread(bufin, bufinlen, pindex, bytes, numbytes)){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{unsigned char curbyte = 0;unsigned char curbit = 0;int c;while((c = fgetc(in)) != EOF){unsigned char uc = (unsigned char)c;//当前扫描得到的符号huffman_code *code = (*se)[uc];//当前扫描符号对应的结构体指针unsigned long i;//将结构体中的码字写到输出文件中for(i = 0; i < code->numbits; ++i){/* Add the current bit to curbyte. */curbyte |= get_bit(code->bits, i) << curbit;/* If this byte is filled up then write it* out and reset the curbit and curbyte. */if(++curbit == 8){fputc(curbyte, out);curbyte = 0;curbit = 0;}}}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/if(curbit > 0)fputc(curbyte, out);return 0;
}static int
do_memory_encode(buf_cache *pc,const unsigned char* bufin,unsigned int bufinlen,SymbolEncoder *se)
{unsigned char curbyte = 0;unsigned char curbit = 0;unsigned int i;for(i = 0; i < bufinlen; ++i){unsigned char uc = bufin[i];huffman_code *code = (*se)[uc];unsigned long i;for(i = 0; i < code->numbits; ++i){/* Add the current bit to curbyte. */curbyte |= get_bit(code->bits, i) << curbit;/* If this byte is filled up then write it* out and reset the curbit and curbyte. */if(++curbit == 8){if(write_cache(pc, &curbyte, sizeof(curbyte)))return 1;curbyte = 0;curbit = 0;}}}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;
}//step3:add by yzhang for huffman statistics
/*
 * Fill st->freq with the relative frequency (count / total_count) of each
 * symbol; symbols that do not occur get 0. Returns 1 when the counts add
 * up to total_count, 0 otherwise.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int i;
    unsigned long count = 0;    /* node counts are unsigned long */

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        const huffman_node *node = (*SF)[i];

        if(node)
        {
            /* Guard the division against a zero or negative total. */
            st->freq[i] = total_count > 0
                ? (float)node->count / total_count
                : 0.0f;
            count += node->count;
        }
        else
        {
            st->freq[i] = 0;
        }
    }

    return (total_count >= 0 && count == (unsigned long)total_count) ? 1 : 0;
}

/*
 * Copy the code length and code bytes of every symbol from se into st;
 * symbols without a code get length 0. Returns 0 on success, 1 if a code
 * does not fit into a row of st->bits.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long i,j;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];

        if(p)
        {
            unsigned long numbytes = numbytes_from_numbits(p->numbits);

            if(numbytes > sizeof(st->bits[i]))
                return 1;
            st->numbits[i] = p->numbits;
            for(j = 0; j < numbytes; j++)
                st->bits[i][j] = p->bits[j];
        }
        else
        {
            st->numbits[i] = 0;
        }
    }

    return 0;
}

/*
 * Write one line per symbol value (0..255) to out_Table: symbol,
 * frequency, code length and the code as a string of 0/1 digits.
 * Does nothing when out_Table is NULL (no statistics file requested).
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int i;
    unsigned long j;
    unsigned char c;

    if(out_Table == NULL)
        return;

    fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        fprintf(out_Table,"%d\t   ",i);
        fprintf(out_Table,"%f\t   ",st->freq[i]);
        /* numbits is unsigned long, so it needs %lu. */
        fprintf(out_Table,"%lu\t    ",st->numbits[i]);
        for(j = 0; j < st->numbits[i]; ++j)
        {
            c = get_bit(st->bits[i], j);
            fprintf(out_Table,"%d",c);
        }
        fprintf(out_Table,"\n");
    }
}
/* end by yzhang */
/** huffman_encode_file huffman encodes in to out.*/
int
huffman_encode_file(FILE *in, FILE *out, FILE *out_Table)  //step1:changed by yzhang for huffman statistics from (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;//step2:add by yzhang for huffman statisticshuffman_stat hs;//文件统计信息//end by yzhang/* 第一次扫描:统计输入文件每个符号(0~255)的频率(0~1)。*//* Get the frequency of each symbol in the input file. */symbol_count = get_symbol_frequencies(&sf, in); //演示扫描完一遍文件后,SF指针数组的每个元素的构成//step3:add by yzhang for huffman statistics,...  get the frequency of each symbol huffST_getSymFrequencies(&sf,&hs,symbol_count);//end by yzhang/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);root = sf[0];//step3:add by yzhang for huffman statistics... output the statistics to filehuffST_getcodeword(se, &hs);output_huffman_statistics(&hs,out_Table);//end by yzhang/* Scan the file again and, using the tablepreviously built, encode it into the output file. */rewind(in);rc = write_code_table(out, se, symbol_count);if(rc == 0)rc = do_file_encode(in, out, se);/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);return rc;
}int
huffman_decode_file(FILE *in, FILE *out)
{huffman_node *root, *p;int c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);if(!root)return 1;/* Decode the file. */p = root;while(data_count > 0 && (c = fgetc(in)) != EOF){unsigned char byte = (unsigned char)c;unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){fputc(p->symbol, out);p = root;--data_count;}}}free_huffman_tree(root);return 0;
}#define CACHE_SIZE 1024int huffman_encode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;buf_cache cache;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen))return 1;/* Get the frequency of each symbol in the input memory. */symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);root = sf[0];/* Scan the memory again and, using the tablepreviously built, encode it into the output memory. */rc = write_code_table_to_memory(&cache, se, symbol_count);if(rc == 0)rc = do_memory_encode(&cache, bufin, bufinlen, se);/* Flush the cache. */flush_cache(&cache);/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);free_cache(&cache);return rc;
}int huffman_decode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{huffman_node *root, *p;unsigned int data_count;unsigned int i = 0;unsigned char *buf;unsigned int bufcur = 0;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;/* Read the Huffman code table. */root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);if(!root)return 1;buf = (unsigned char*)malloc(data_count);/* Decode the memory. */p = root;for(; i < bufinlen && data_count > 0; ++i) {unsigned char byte = bufin[i];unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){buf[bufcur++] = p->symbol;p = root;--data_count;}}}free_huffman_tree(root);*pbufout = buf;*pbufoutlen = bufcur;return 0;
}

三、实验结果:

生成的10种类型文件符号概率分布图:

实验结果的分析 :

在无失真信源编码定理中说明了,对于二进制码信源符号,平均码长的下界为信源熵。当信源符号接近等概分布时,信源熵最大,压缩比最低。即当文件的信源符号概率分布越不均匀,通过霍夫曼编码得到的编码效率越高。实验中对于xlsx文件的压缩时甚至出现了压缩比要大于一的情况,这是因为文件中的信源符号分布特别均匀再加上码表的传输使得整个文件经过huffman编码后反而增加了数据比特数。

实验三Huffman编码与解码相关推荐

  1. 数据压缩 实验三 Huffman编解码算法实现与压缩效率分析

    实验目的 掌握Huffman编解码实现的数据结构和实现框架, 进一步熟练使用C编程语言, 并完成压缩效率的分析. 实验原理 1.本实验中Huffman编码算法 (1)将文件以ASCII字符流的形式读入 ...

  2. python Huffman编码及解码

    Huffman编码及解码 # coding:utf-8#Tree-Node Type class Node:def __init__(self,freq):self.left = Noneself.r ...

  3. 北理乐学计算机实验三,北理工大学计算机实验三-字符编码与信息交换.docx

    北理工大学计算机实验三-字符编码与信息交换.docx 实验报告实验名称学号 姓名 班级 实验时间 年 月 日实验报告表3-1 西文字符显示过程编码记录表输入字符ASCII码(十进制数)内存信息(二进制 ...

  4. Huffman编码与解码

    Huffman编码与解码 // @author: Folivora Li // @copyright Folivora Li/* 4.Huffman编码与解码 (必做)(Huffman编码.二叉树) ...

  5. 实验三—Huffman编解码

    一.实验原理 1.Huffman编码的步骤: (1)首先将所有字符发生的概率从小到大进行排序: (2)将最小的两个概率进行两两一合并,之后继续找最小的两个概率进行合并包括前面已经合并的和数: (3)一 ...

  6. 实验三 Huffman编解码算法实现与压缩效率分析

    一.Huffman编解码原理 1. Huffman编码 对原始文件进行Huffman编码,首先需要解决以下几点问题: 文件符号的概率分布情况是怎样的? Huffman树是如何建立的? 建立起Huffm ...

  7. 数据压缩原理 实验三 Huffman编解码算法实现与压缩效率分析

    实验原理 Huffman编码是一种无失真编码方式,是一种可变长编码,它将出现概率大的信源符号短编码,出现概率小的信源符号长编码. 编码步骤: ①将文件以ASCII字符流的形式读入,统计每个符号的发生概 ...

  8. 数据压缩实验三--Huffman编解码及压缩率的比较

    一,Huffman码 1 Huffman 编码 Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman编码是可变字长编码(VLC)的一种. Huffman 编码基于信源 ...

  9. DS二叉树——Huffman编码与解码(不含代码框架)

    题目描述 1.问题描述 给定n个字符及其对应的权值,构造Huffman树,并进行huffman编码和译(解)码. 构造Huffman树时,要求左子树根的权值小于.等于右子树根的权值. 进行Huffma ...

最新文章

  1. AI一分钟 | 阿里云放大招要揽1000名AI人才,川普AI守国论遭遇54名科学家反对
  2. 同时给两个变量值赋值
  3. 用python处理excel数据的优势-python数据分析相对于bi和excel的优势是什么?
  4. Vue.js 状态过渡
  5. Java常考面试题(一)
  6. 数据表的查看 mysql
  7. 关于SQL Server 2005 的自动远程数据库备份
  8. python释放变量内存_看完2019年阿里巴巴Python面试题详解,月薪3万不是梦
  9. C# 通过PostMessage完成UI的更新
  10. JAVA实现显示指定类型的文件的例子
  11. 职工信息管理系统(c语言实现)
  12. Kali下安装 dvwa 的完整详细教程
  13. 电子面单打印模板规格汇总-快递鸟
  14. 遥感学习笔记(四)——遥感数据分类
  15. idea修改主题和更换背景
  16. 星巴克在东京开设四层楼的全沉浸式优质咖啡体验门店
  17. 台式计算机如何自动开关机,电脑怎么设置自动关机时间 电脑自动开机时间怎么设置...
  18. 最长等差数列 leetcode java_51nod1055 最长等差数列
  19. 一学就会的 WordPress 实战课
  20. deepin v20显卡问题wifi网速慢cpu高频率发热(2021-1-23更新)

热门文章

  1. STM32个人笔记-CAN总线通讯
  2. 小论快充(原理、协议、比较)
  3. 电影垂直社交观影和亲友们在家一起看电影吧
  4. MAC终端输入换行问题
  5. 全球首辆飞行汽车将在欧洲上路行驶;全球十大电视制造商明年将购买2亿块液晶电视面板 | 美通企业日报...
  6. 《剑指Offer》题解汇总索引表(leetcode)
  7. 实用五步法教会你指标体系的设计与加工
  8. 凸优化学习-(二十九)有约束优化算法——增广拉格朗日法、交替方向乘子法(ADMM)
  9. java poi 模板填数据库,java使用POI读取excel模版并向固定表格里填写数据详解
  10. C#分割字符串。歌词