一、Huffman编解码原理

1. Huffman编码

对原始文件进行Huffman编码，首先需要解决以下几点问题：

文件符号的概率分布情况是怎样的？
Huffman树是如何建立的？
建立起Huffman树后，又是怎样读出符号对应码字的？

这三个问题在程序中的实现思路如下图：

将待编码文件里的数据参照已形成的Huffman码表一一进行转换，就可以得到编码后的文件了。

2. Huffman解码

Huffman解码是查表＋翻译的过程。读取随接收文件传来的码表后，再逐位读取文件实际数据，对照码表进行翻译即可。

二、程序实现

流程中最关键的对Huffman树的操作在程序中主要通过两个结构体实现：Huffman_node和Huffman_code。
建立的二叉树上每个节点都以Huffman_node类型存在。节点之间的主要关系有父子、兄弟，Huffman_node中定义了指向父节点的指针*parent和指向孩子的指针*zero, *one来表述节点与节点之间的关系。除此之外，还有节点本身的属性：isLeaf、count、symbol。
而编码码字定义为了Huffman_code，本身属性包括码字占用的比特数和码字本身。
具体程序如下，部分理解在注释中给出。

Huffcode.c

/**  huffcode - Encode/Decode files using Huffman encoding.*  http://huffman.sourceforge.net*  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc**  This library is free software; you can redistribute it and/or*  modify it under the terms of the GNU Lesser General Public*  License as published by the Free Software Foundation; either*  version 2.1 of the License, or (at your option) any later version.**  This library is distributed in the hope that it will be useful,*  but WITHOUT ANY WARRANTY; without even the implied warranty of*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU*  Lesser General Public License for more details.**  You should have received a copy of the GNU Lesser General Public*  License along with this library; if not, write to the Free Software*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endifstatic int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);static void
version(FILE *out)
{fputs("huffcode 0.3\n""Copyright (C) 2003 Douglas Ryan Richardson""; Gauss Interprise, Inc\n",out);
}static void
usage(FILE* out)
{fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c]\n""-i - input file (default is standard input)\n""-o - output file (default is standard output)\n""-d - decompress\n""-c - compress (default)\n""-m - read file into memory, compress, then write to file (not default)\n",// step1: by yzhang, for huffman statistics"-t - output huffman statistics\n",//step1:end by yzhangout);
}int
main(int argc, char** argv)
{char memory = 0;char compress = 1;int opt;const char *file_in = NULL, *file_out = NULL;//step1:add by yzhang for huffman statisticsconst char *file_out_table = NULL;//end by yzhangFILE *in = stdin;FILE *out = stdout;//step1:add by yzhang for huffman statisticsFILE * outTable = NULL;//end by yzhang/* Get the command line arguments. */while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1) //演示如何跳出循环，及查找括号对{switch(opt){case 'i':file_in = optarg;break;case 'o':file_out = optarg;break;case 'c':compress = 1;//压缩break;case 'd':compress = 0;//解压break;case 'h':usage(stdout);return 0;case 'v':version(stdout);return 0;case 'm':memory = 1;break;// by yzhang for huffman statisticscase 't':file_out_table = optarg;            break;//end by yzhangdefault:usage(stderr);return 1;}}/* If an input file is given then open it. */if(file_in){in = fopen(file_in, "rb");if(!in){fprintf(stderr,"Can't open input file '%s': %s\n",file_in, strerror(errno));return 1;}}/* If an output file is given then create it. */if(file_out){out = fopen(file_out, "wb");if(!out){fprintf(stderr,"Can't open output file '%s': %s\n",file_out, strerror(errno));return 1;}}//by yzhang for huffman statisticsif(file_out_table){outTable = fopen(file_out_table, "w");if(!outTable){fprintf(stderr,"Can't open output file '%s': %s\n",file_out_table, strerror(errno));return 1;}}//end by yzhangif(memory){return compress ?memory_encode_file(in, out) : memory_decode_file(in, out);}if(compress)  //change by yzhanghuffman_encode_file(in, out,outTable);//step1:changed by yzhang from huffman_encode_file(in, out) to huffman_encode_file(in, out,outTable)elsehuffman_decode_file(in, out);if(in)fclose(in);if(out)fclose(out);if(outTable)fclose(outTable);return 0;
}static int
memory_encode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Encode the memory. */if(huffman_encode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;
}static int
memory_decode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Decode the memory. */if(huffman_decode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;
}

Huffman.c

/**  huffman - Encode/Decode files using Huffman encoding.*  http://huffman.sourceforge.net*  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc**  This library is free software; you can redistribute it and/or*  modify it under the terms of the GNU Lesser General Public*  License as published by the Free Software Foundation; either*  version 2.1 of the License, or (at your option) any later version.**  This library is distributed in the hope that it will be useful,*  but WITHOUT ANY WARRANTY; without even the implied warranty of*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU*  Lesser General Public License for more details.**  You should have received a copy of the GNU Lesser General Public*  License along with this library; if not, write to the Free Software*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"#ifdef WIN32
#include <winsock2.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#endiftypedef struct huffman_node_tag
{unsigned char isLeaf;unsigned long count;struct huffman_node_tag *parent;union{struct{struct huffman_node_tag *zero, *one;};unsigned char symbol;};
} huffman_node;typedef struct huffman_code_tag
{/* The length of this code in bits. */unsigned long numbits;/* The bits that make up this code. The firstbit is at position 0 in bits[0]. The secondbit is at position 1 in bits[0]. The eighthbit is at position 7 in bits[0]. The ninthbit is at position 0 in bits[1]. */unsigned char *bits;
} huffman_code;//step2:add by yzhang for huffman statistics
//存放信源符号的信息:符号频率、比特数、符号码字
typedef struct huffman_statistics_result
{float freq[256];unsigned long numbits[256];unsigned char bits[256][100];
}huffman_stat;/*huffman_stat *init_huffstatistics()
{   huffman_stat *p;int i;p = (huffman_stat*)malloc(sizeof(huffman_stat));p->freq = (float *)malloc(sizeof(float)*256 );p->numbits = (unsigned long *) malloc(sizeof(unsigned long)*256);for (i=0 ; i<256;i++)p->bits[i] = (unsigned char *)malloc(sizeof(unsigned char)*100); return p;
}*/
//end by yzhang//将bit数转换为其对应的byte数,不能被8整除的部分要多分配一整个byte给它
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{return numbits / 8 + (numbits % 8 ? 1 : 0);
}/** get_bit returns the ith bit in the bits array* in the 0th position of the return value.*/
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{return (bits[i / 8] >> i % 8) & 1;
}//由于程序中从二叉树形成码字的过程是从叶到根的,所以需要bit反转函数来获得顺序正确的码字,同时以byte为单位对其进行规范化
//例:传入倒序码字为010111011,通过bit反转函数变为00000001 10111010
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{unsigned long numbytes = numbytes_from_numbits(numbits);unsigned char *tmp =(unsigned char*)alloca(numbytes);//alloca与malloc功能相似，但alloca会自动释放申请的空间unsigned long curbit;long curbyte = 0;memset(tmp, 0, numbytes);//将tmp指向空间的前numbytes个字节内容全部置0for(curbit = 0; curbit < numbits; ++curbit){unsigned int bitpos = curbit % 8;//如果一个byte写满了，就跳到下一个byte继续写if(curbit > 0 && curbit % 8 == 0)++curbyte;//通过get_bit函数从传入的bits里获得当前操作的比特结果,用移位运算将其移动到在一个byte里对应的位置//由于tmp的指向操作是以byte为单位的，这里只能通过按位取或(|=)来把bit一个一个写到tmp指向的空间里去//bit反转是靠numbits-curbit-1实现的tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);}memcpy(bits, tmp, numbytes);//把反转后的tmp写回到bits里
}/** new_code builds a huffman_code from a leaf in* a Huffman tree.*/
static huffman_code*
new_code(const huffman_node* leaf)
{/* Build the huffman code by walking up to* the root node and then reversing the bits,* since the Huffman code is calculated by* walking down the tree. */unsigned long numbits = 0;unsigned char* bits = NULL;huffman_code *p;//此段while循环的目的是从传入的叶结点开始向上进行寻根,得到该叶结点对应的码字while(leaf && leaf->parent){huffman_node *parent = leaf->parent;unsigned char cur_bit = (unsigned char)(numbits % 8);unsigned long cur_byte = numbits / 8;/* If we need another byte to hold the code,then allocate it. */if(cur_bit == 0){size_t newSize = cur_byte + 1;bits = (unsigned char*)realloc(bits, newSize);//把bits所占的空间大小调整为newSize个字节bits[newSize - 1] = 0; /* Initialize the new byte. */}/* If a one must be added then or it in. If a zero* must be added then do nothing, since the byte* was initialized to zero. */if(leaf == parent->one)//如果叶结点的地址等于该叶结点的爹妈的1孩子地址,则进行对应的移位操作bits[cur_byte] |= 1 << cur_bit;++numbits;leaf = parent;}if(bits)reverse_bits(bits, numbits);p = (huffman_code*)malloc(sizeof(huffman_code));p->numbits = numbits;p->bits = bits;return p;//p里包含了编完的码字、码字长度
}#define MAX_SYMBOLS 256
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];//传入符号,建立其对应的叶结点,设置参数
static huffman_node*
new_leaf_node(unsigned char symbol)
{huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));p->isLeaf = 1;p->symbol = symbol;p->count = 0;p->parent = 0;return p;
}//建立一个非叶结点,并将它的0、1孩子地址设置为传入的0、1结点地址
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));p->isLeaf = 0;p->count = count;p->zero = zero;p->one = one;p->parent = 0;return p;
}static void
free_huffman_tree(huffman_node *subtree)
{if(subtree == NULL)return;if(!subtree->isLeaf){free_huffman_tree(subtree->zero);free_huffman_tree(subtree->one);}free(subtree);
}static void
free_code(huffman_code* p)
{free(p->bits);free(p);
}static void
free_encoder(SymbolEncoder *pSE)
{unsigned long i;for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*pSE)[i];if(p)free_code(p);}free(pSE);
}static void
init_frequencies(SymbolFrequencies *pSF)
{memset(*pSF, 0, sizeof(SymbolFrequencies));
#if 0unsigned int i;for(i = 0; i < MAX_SYMBOLS; ++i){unsigned char uc = (unsigned char)i;(*pSF)[i] = new_leaf_node(uc);}
#endif
}typedef struct buf_cache_tag
{unsigned char *cache;unsigned int cache_len;unsigned int cache_cur;unsigned char **pbufout;unsigned int *pbufoutlen;
} buf_cache;static int init_cache(buf_cache* pc,unsigned int cache_size,unsigned char **pbufout,unsigned int *pbufoutlen)
{assert(pc && pbufout && pbufoutlen);if(!pbufout || !pbufoutlen)return 1;pc->cache = (unsigned char*)malloc(cache_size);pc->cache_len = cache_size;pc->cache_cur = 0;pc->pbufout = pbufout;*pbufout = NULL;pc->pbufoutlen = pbufoutlen;*pbufoutlen = 0;return pc->cache ? 0 : 1;
}static void free_cache(buf_cache* pc)
{assert(pc);if(pc->cache){free(pc->cache);pc->cache = NULL;}
}static int flush_cache(buf_cache* pc)
{assert(pc);if(pc->cache_cur > 0){unsigned int newlen = pc->cache_cur + *pc->pbufoutlen;unsigned char* tmp = realloc(*pc->pbufout, newlen);if(!tmp)return 1;memcpy(tmp + *pc->pbufoutlen, pc->cache, pc->cache_cur);*pc->pbufout = tmp;*pc->pbufoutlen = newlen;pc->cache_cur = 0;}return 0;
}static int write_cache(buf_cache* pc,const void *to_write,unsigned int to_write_len)
{unsigned char* tmp;assert(pc && to_write);assert(pc->cache_len >= pc->cache_cur);/* If trying to write more than the cache will hold* flush the cache and allocate enough space immediately,* that is, don't use the cache. */if(to_write_len > pc->cache_len - pc->cache_cur){unsigned int newlen;flush_cache(pc);newlen = *pc->pbufoutlen + to_write_len;tmp = realloc(*pc->pbufout, newlen);if(!tmp)return 1;memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);*pc->pbufout = tmp;*pc->pbufoutlen = newlen;}else{/* Write the data to the cache. */memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);pc->cache_cur += to_write_len;}return 0;
}//为信源符号建立叶结点,统计次数
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{int c;unsigned int total_count = 0;/* Set all frequencies to 0. */init_frequencies(pSF);/* Count the frequency of each symbol in the input file. */while((c = fgetc(in)) != EOF){unsigned char uc = c;if(!(*pSF)[uc])//如果第一次遇到这个符号,则新建该符号的叶结点(*pSF)[uc] = new_leaf_node(uc);++(*pSF)[uc]->count;//对所有符号出现的次数分别进行计数++total_count;}return total_count;
}static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,const unsigned char *bufin,unsigned int bufinlen)
{unsigned int i;unsigned int total_count = 0;/* Set all frequencies to 0. */init_frequencies(pSF);/* Count the frequency of each symbol in the input file. */for(i = 0; i < bufinlen; ++i){unsigned char uc = bufin[i];if(!(*pSF)[uc])(*pSF)[uc] = new_leaf_node(uc);++(*pSF)[uc]->count;++total_count;}return total_count;
}/** When used by qsort, SFComp sorts the array so that* the symbol with the lowest frequency is first. Any* NULL entries will be sorted to the end of the list.*/
static int
SFComp(const void *p1, const void *p2)
{const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;/* Sort all NULLs to the end. */if(hn1 == NULL && hn2 == NULL)return 0;if(hn1 == NULL)return 1;if(hn2 == NULL)return -1;if(hn1->count > hn2->count)return 1;else if(hn1->count < hn2->count)return -1;return 0;
}#if 1
static void
print_freqs(SymbolFrequencies * pSF)
{size_t i;for(i = 0; i < MAX_SYMBOLS; ++i){if((*pSF)[i])printf("%d, %ld\n", (*pSF)[i]->symbol, (*pSF)[i]->count);elseprintf("NULL\n");}
}
#endif/** build_symbol_encoder builds a SymbolEncoder by walking* down to the leaves of the Huffman tree and then,* for each leaf, determines its code.*/
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{if(subtree == NULL)return;//如果传入的结点是叶结点,对其进行编码并存放在对应的指针指向的空间里;如果不是,用递归方法不断调用自身传入该结点的左、右孩子,直到叶结点if(subtree->isLeaf)(*pSF)[subtree->symbol] = new_code(subtree);else{   //递归build_symbol_encoder(subtree->zero, pSF);build_symbol_encoder(subtree->one, pSF);}
}/** calculate_huffman_codes turns pSF into an array* with a single entry that is the root of the* huffman tree. The return value is a SymbolEncoder,* which is an array of huffman codes index by symbol value.*/
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{unsigned int i = 0;unsigned int n = 0;huffman_node *m1 = NULL, *m2 = NULL;SymbolEncoder *pSE = NULL;#if 1printf("BEFORE SORT\n");print_freqs(pSF);   //演示堆栈的使用
#endif/* Sort the symbol frequency array by ascending frequency. *///qsort是自带的快速排序函数,参数为待排序数组的首地址(*pSF),排序元素数量(MAX_SYMBOLS),每个元素的长度(sizeof((*pSF)[0])),自定义的比较函数(SFComp,返回1则前〉后,-1则后〉前)qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);   //讲解SFComp函数的作用，断点在调试程序里的作用#if 1   printf("AFTER SORT\n");print_freqs(pSF);
#endif/* Get the number of symbols. */for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n);/** Construct a Huffman tree. This code is based* on the algorithm given in Managing Gigabytes* by Ian Witten et al, 2nd edition, page 34.* Note that this implementation uses a simple* count instead of probability.*/for(i = 0; i < n - 1; ++i){/* Set m1 and m2 to the two subsets of least probability. */m1 = (*pSF)[0];m2 = (*pSF)[1];/* Replace m1 and m2 with a set {m1, m2} whose probability* is the sum of that of m1 and m2. */(*pSF)[0] = m1->parent = m2->parent =new_nonleaf_node(m1->count + m2->count, m1, m2);(*pSF)[1] = NULL;/* Put newSet into the correct count position in pSF. */qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);}/* Build the SymbolEncoder array from the tree. */pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));memset(pSE, 0, sizeof(SymbolEncoder));build_symbol_encoder((*pSF)[0], pSE);return pSE;
}/** Write the huffman code table. The format is:* 4 byte code count in network byte order.* 4 byte number of bytes encoded*   (if you decode the data, you should get this number of bytes)* code1* ...* codeN, where N is the count read at the begginning of the file.* Each codeI has the following format:* 1 byte symbol, 1 byte code bit length, code bytes.* Each entry has numbytes_from_numbits code bytes.* The last byte of each code may have extra bits, if the number of* bits in the code is not a multiple of 8.*/
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{unsigned long i, count = 0;/* Determine the number of entries in se. */for(i = 0; i < MAX_SYMBOLS; ++i){if((*se)[i])++count;}/* Write the number of entries in network byte order. */i = htonl(count);    //在网络传输中，采用big-endian序，对于0x0A0B0C0D ，传输顺序就是0A 0B 0C 0D ，//因此big-endian作为network byte order，little-endian作为host byte order。//little-endian的优势在于unsigned char/short/int/long类型转换时，存储位置无需改变if(fwrite(&i, sizeof(i), 1, out) != 1)return 1;/* Write the number of bytes that will be encoded. */symbol_count = htonl(symbol_count);if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)return 1;/* Write the entries. */for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if(p){unsigned int numbytes;/* Write the 1 byte symbol. */fputc((unsigned char)i, out);/* Write the 1 byte code bit length. */fputc(p->numbits, out);/* Write the code bytes. */numbytes = numbytes_from_numbits(p->numbits);if(fwrite(p->bits, 1, numbytes, out) != numbytes)return 1;}}return 0;
}/** Allocates memory and sets *pbufout to point to it. The memory* contains the code table.*/
static int
write_code_table_to_memory(buf_cache *pc,SymbolEncoder *se,unsigned int symbol_count)
{unsigned long i, count = 0;/* Determine the number of entries in se. */for(i = 0; i < MAX_SYMBOLS; ++i){if((*se)[i])++count;}/* Write the number of entries in network byte order. */i = htonl(count);if(write_cache(pc, &i, sizeof(i)))return 1;/* Write the number of bytes that will be encoded. */symbol_count = htonl(symbol_count);if(write_cache(pc, &symbol_count, sizeof(symbol_count)))return 1;/* Write the entries. */for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if(p){unsigned int numbytes;/* The value of i is < MAX_SYMBOLS (256), so it canbe stored in an unsigned char. */unsigned char uc = (unsigned char)i;/* Write the 1 byte symbol. */if(write_cache(pc, &uc, sizeof(uc)))return 1;/* Write the 1 byte code bit length. */uc = (unsigned char)p->numbits;if(write_cache(pc, &uc, sizeof(uc)))return 1;/* Write the code bytes. */numbytes = numbytes_from_numbits(p->numbits);if(write_cache(pc, p->bits, numbytes))return 1;}}return 0;
}/** read_code_table builds a Huffman tree from the code* in the in file. This function returns NULL on error.* The returned value should be freed with free_huffman_tree.*/
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{//在解码端重建huffman树huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.(it is stored in network byte order). */if(fread(&count, sizeof(count), 1, in) != 1){free_huffman_tree(root);return NULL;}count = ntohl(count);//将一个无符号长整形数从网络字节顺序转换为主机字节顺序/* Read the number of data bytes this encoding represents. */if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0){int c;unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if((c = fgetc(in)) == EOF)//读取符号并判断{free_huffman_tree(root);return NULL;}symbol = (unsigned char)c;if((c = fgetc(in)) == EOF)//读取字符长度并判断{free_huffman_tree(root);return NULL;}numbits = (unsigned char)c;numbytes = (unsigned char)numbytes_from_numbits(numbits);bytes = (unsigned char*)malloc(numbytes);if(fread(bytes, 1, numbytes, in) != numbytes){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}static int
memread(const unsigned char* buf,unsigned int buflen,unsigned int *pindex,void* bufout,unsigned int readlen)
{assert(buf && pindex && bufout);assert(buflen >= *pindex);if(buflen < *pindex)return 1;if(readlen + *pindex >= buflen)return 1;memcpy(bufout, buf + *pindex, readlen);*pindex += readlen;return 0;
}static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,unsigned int bufinlen,unsigned int *pindex,unsigned int *pDataBytes)
{huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.(it is stored in network byte order). */if(memread(bufin, bufinlen, pindex, &count, sizeof(count))){free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes))){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0){unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol))){free_huffman_tree(root);return NULL;}if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits))){free_huffman_tree(root);return NULL;}numbytes = (unsigned char)numbytes_from_numbits(numbits);bytes = (unsigned char*)malloc(numbytes);if(memread(bufin, bufinlen, pindex, bytes, numbytes)){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{unsigned char curbyte = 0;unsigned char curbit = 0;int c;while((c = fgetc(in)) != EOF){unsigned char uc = (unsigned char)c;huffman_code *code = (*se)[uc];unsigned long i;for(i = 0; i < code->numbits; ++i){/* Add the current bit to curbyte. */curbyte |= get_bit(code->bits, i) << curbit;/* If this byte is filled up then write it* out and reset the curbit and curbyte. */if(++curbit == 8){fputc(curbyte, out);curbyte = 0;curbit = 0;}}}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/if(curbit > 0)//写最后一个符号没写满8bit的情况fputc(curbyte, out);return 0;
}static int
do_memory_encode(buf_cache *pc,const unsigned char* bufin,unsigned int bufinlen,SymbolEncoder *se)
{unsigned char curbyte = 0;unsigned char curbit = 0;unsigned int i;for(i = 0; i < bufinlen; ++i){unsigned char uc = bufin[i];huffman_code *code = (*se)[uc];unsigned long i;for(i = 0; i < code->numbits; ++i){/* Add the current bit to curbyte. */curbyte |= get_bit(code->bits, i) << curbit;/* If this byte is filled up then write it* out and reset the curbit and curbyte. */if(++curbit == 8){if(write_cache(pc, &curbyte, sizeof(curbyte)))return 1;curbyte = 0;curbit = 0;}}}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;
}//step3:add by yzhang for huffman statistics
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{int i,count =0;for(i = 0; i < MAX_SYMBOLS; ++i){   if((*SF)[i]){st->freq[i]=(float)(*SF)[i]->count/total_count;count+=(*SF)[i]->count;}else {st->freq[i]= 0;}}if(count==total_count)return 1;elsereturn 0;
}int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{unsigned long i,j;for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if(p){unsigned int numbytes;st->numbits[i] = p->numbits;numbytes = numbytes_from_numbits(p->numbits);for (j=0;j<numbytes;j++)st->bits[i][j] = p->bits[j];}elsest->numbits[i] =0;}return 0;
}void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{int i,j;unsigned char c;fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");for(i = 0; i < MAX_SYMBOLS; ++i){   fprintf(out_Table,"%d\t   ",i);fprintf(out_Table,"%f\t   ",st->freq[i]);fprintf(out_Table,"%d\t    ",st->numbits[i]);if(st->numbits[i]){for(j = 0; j < st->numbits[i]; ++j){c =get_bit(st->bits[i], j);fprintf(out_Table,"%d",c);}}fprintf(out_Table,"\n");}
}
//end by yzhang
/** huffman_encode_file huffman encodes in to out.*/
int
huffman_encode_file(FILE *in, FILE *out, FILE *out_Table)  //step1:changed by yzhang for huffman statistics from (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;//step2:add by yzhang for huffman statisticshuffman_stat hs;//end by yzhang/* Get the frequency of each symbol in the input file. */symbol_count = get_symbol_frequencies(&sf, in); //演示扫描完一遍文件后，SF指针数组的每个元素的构成//step3:add by yzhang for huffman statistics,...  get the frequency of each symbol huffST_getSymFrequencies(&sf,&hs,symbol_count);//end by yzhang/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);root = sf[0];//step3:add by yzhang for huffman statistics... output the statistics to filehuffST_getcodeword(se, &hs);output_huffman_statistics(&hs,out_Table);//end by yzhang/* Scan the file again and, using the tablepreviously built, encode it into the output file. */rewind(in);rc = write_code_table(out, se, symbol_count);if(rc == 0)rc = do_file_encode(in, out, se);/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);return rc;
}int
huffman_decode_file(FILE *in, FILE *out)
{huffman_node *root, *p;int c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);if(!root)return 1;/* Decode the file. */p = root;while(data_count > 0 && (c = fgetc(in)) != EOF){unsigned char byte = (unsigned char)c;unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){fputc(p->symbol, out);p = root;--data_count;}}}free_huffman_tree(root);return 0;
}#define CACHE_SIZE 1024int huffman_encode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;buf_cache cache;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen))return 1;/* Get the frequency of each symbol in the input memory. */symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);root = sf[0];/* Scan the memory again and, using the tablepreviously built, encode it into the output memory. */rc = write_code_table_to_memory(&cache, se, symbol_count);if(rc == 0)rc = do_memory_encode(&cache, bufin, bufinlen, se);/* Flush the cache. */flush_cache(&cache);/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);free_cache(&cache);return rc;
}int huffman_decode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{huffman_node *root, *p;unsigned int data_count;unsigned int i = 0;unsigned char *buf;unsigned int bufcur = 0;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;/* Read the Huffman code table. */root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);if(!root)return 1;buf = (unsigned char*)malloc(data_count);/* Decode the memory. */p = root;for(; i < bufinlen && data_count > 0; ++i) {unsigned char byte = bufin[i];unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){buf[bufcur++] = p->symbol;p = root;--data_count;}}}free_huffman_tree(root);*pbufout = buf;*pbufoutlen = bufcur;return 0;
}

三、结果分析

实验选取了10中文件类型进行Huffman编码，分别为bmp、doc、exe、pdf、png、ppt、rar、wav、xls、yuv。对编码后的文件进行分析，得到以下结果图表：

可以看到，进行Huffman编码后，大多数文件都变小了，压缩比在1到4之间。但也有rar这样经过编码后不小反大的文件。

再观察每个文件的字符概率分布情况：

对比联合图表可以发现，压缩比是由概率分布决定的。相比于实验选用的bmp、doc等字符概率比较集中的文件，字符概率分布平均分散的文件（如rar、png、pdf），压缩比更小，信源熵更大。

实验三 Huffman编解码算法实现与压缩效率分析相关推荐

数据压缩实验三 Huffman编解码算法实现与压缩效率分析
实验目的掌握Huffman编解码实现的数据结构和实现框架, 进一步熟练使用C编程语言, 并完成压缩效率的分析. 实验原理 1．本实验中Huffman编码算法 (1)将文件以ASCII字符流的形式读入 ...
数据压缩原理实验三 Huffman编解码算法实现与压缩效率分析
实验原理 Huffman编码是一种无失真编码方式,是一种可变长编码,它将出现概率大的信源符号短编码,出现概率小的信源符号长编码. 编码步骤: ①将文件以ASCII字符流的形式读入,统计每个符号的发生概 ...
Huffman 编解码算法实现与压缩效率分析
一.实验原理 1 熵,又称为"信息熵" (Entropy) 1.1 在信息论中,熵是信息的度量单位.信息论的创始人 Shannon 在其著作<通信的数学理论>中提出了 ...
实验三 LZW编解码算法实现与分析
LZW简述本部分参考wiki https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch LZW压缩算法在1978年提出,由 Abr ...
实验三—Huffman编解码
一.实验原理 1.Huffman编码的步骤: (1)首先将所有字符发生的概率从小到大进行排序: (2)将最小的两个概率进行两两一合并,之后继续找最小的两个概率进行合并包括前面已经合并的和数: (3)一 ...
[实验三]LZW 编解码算法实现与分析
目录一.LZW算法 1.1 编码步骤 1.2 解码步骤 1.3 关于有可能出现当前码字CW不在词典中的情况说明二.代码实现 2.1 程序说明 2.2 数据结构 2.3 bitio.h 2.4 bi ...
数据压缩实验三--Huffman编解码及压缩率的比较
一,Huffman码 1 Huffman 编码 Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman编码是可变字长编码(VLC)的一种. Huffman 编码基于信源 ...
huffman编解码算法实验与压缩效率分析
一.基本原理 1.huffman编码原理 huffman编码是一种无失真编码方式,是可变长(VLC)编码的一种. huffman编码基于信源的概率统计模型,基本思路是出现概率大的信源符号编长码,出现概 ...
实验三 LZW编解码实验
一.LZW算法简介 LZW为词典编码的一种,是通过从输入数据中创建"短语词典".在编码过程中遇到词典中出现的"短语"时,编码器就输出其对应的"序号&q ...

实验三 Huffman编解码算法实现与压缩效率分析