Huffman编解码完全注释

/**  huffman - Encode/Decode files using Huffman encoding.*  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc**  This library is free software; you can redistribute it and/or*  modify it under the terms of the GNU Lesser General Public*  License as published by the Free Software Foundation; either*  version 2.1 of the License, or (at your option) any later version.**  This library is distributed in the hope that it will be useful,*  but WITHOUT ANY WARRANTY; without even the implied warranty of*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU*  Lesser General Public License for more details.**  You should have received a copy of the GNU Lesser General Public*  License along with this library; if not, write to the Free Software*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*//** 针对可打印字符的huffman编解码.*/#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"#ifdef WIN32
#include <winsock2.h>
#include <winsock.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#endif// ---------------------------------------------------------------------------------------
/* 描述huffman树的节点 */
typedef struct huffman_node_tag
{unsigned char        isLeaf;       // 当前节点是否为叶子节点unsigned long        count;     // 权(字符出现的次数)struct huffman_node_tag *parent;   // 当前节点的父亲union {                           /** 如果改节点为叶子节点, 则有unsigned char symbol字段,* 如果该节点不是叶子节点, 则具有左孩子右孩子字段.*/struct {struct huffman_node_tag *zero;  // 左孩子struct huffman_node_tag *one;   // 右孩子};unsigned char symbol;    // 存储的字符.};
} huffman_node;
/* 编码 */
typedef struct huffman_code_tag
{/** numbits 这个字段是用来记录从leaf--->root一共走了多少步.* 也就是叶子节点所含字符的编码长度* bits 是用来存储编码的, 但是它是以Byte为单位的, 而编码是* 以bit为单位的, 所以需要根据 numbits 去从 bits 里提取出前* numbits 个bit。 这才是真正的编码.** 比如* numbits=4* bits = 0110 1011* 那么我们应该提取bits的低4位 1011** 注意 numbits 和 bits是有关系的, bits一定不可能超过 numbits/8 *//* The length of this code in bits. */unsigned long numbits;/* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 *//* The bits that make up this code. The firstbit is at position 0 in bits[0]. The secondbit is at position 1 in bits[0]. The eighthbit is at position 7 in bits[0]. The ninthbit is at position 0 in bits[1]. */unsigned char *bits;     /* 用来存储编码String */
} huffman_code;int
main(int argc, char** argv)
{/*char       memory = 0;char       compress = 1;int        opt;const char    *file_in = NULL;const char  *file_out = NULL;*///FILE     *in = stdin;//FILE     *out = stdout;FILE       *in = NULL;FILE       *out = NULL;FILE       *out2 = NULL;in = fopen("test.txt", "r+");out = fopen("test1.txt", "w+");//out2 = fopen("test2.txt", "w+");int result = 0;result = huffman_encode_file(in, out);//result = huffman_decode_file(out, out2);printf("result : %d/n", result);system("pause");return 1;
}// ---------------------------------------------------------------------------------------
/*** bit-->byte转换, 不足8位则补齐8位.*/
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{return numbits / 8 + (numbits % 8 ? 1 : 0);
}/** get_bit returns the ith bit in the bits array* in the 0th position of the return value.* 取出第i位(从低位往高位排)*/
static unsigned char
get_bit(unsigned char *bits, unsigned long i)
{return ( bits[i/8] >> (i%8) ) & 1;
}/*** 反转数组的前((numbits/8)+1)个字符**/
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{unsigned long numbytes = numbytes_from_numbits(numbits);    // 所占字节数.unsigned char *tmp = (unsigned char*)calloc(numbytes, sizeof(unsigned char));    // 分配内存unsigned long curbit;           // index -- 当前位long          curbyte = 0;  // 当前是byte的indexmemset(tmp, 0, numbytes);   // 将tmp指向的buffer清零/***/for(curbit=0; curbit<numbits; ++curbit) {unsigned int bitpos = curbit % 8;  // 当前byte里的indexif(curbit>0 && curbit%8==0) {      //++curbyte;}/** 按位 OR**/tmp[curbyte] |= ( get_bit(bits, numbits-curbit-1) << bitpos );}memcpy(bits, tmp, numbytes);
}/** new_code builds a huffman_code from a leaf in* a Huffman tree.* 对指定的叶子进行编码.*/
static huffman_code*
new_code(const huffman_node* leaf)
{/* Build the huffman code by walking up to* the root node and then reversing the bits,* since the Huffman code is calculated by* walking down the tree. */unsigned long numbits = 0;unsigned char *bits = NULL;huffman_code  *p;while(leaf!=NULL && leaf->parent!=NULL) {huffman_node *parent = leaf->parent;   // 当前活动节点的双亲unsigned long cur_byte   = numbits / 8;    // 当前byteunsigned char cur_bit    = (unsigned char)(numbits % 8); // 当前byte里的当前bit/* If we need another byte to hold the code,then allocate it. */if(cur_bit == 0) {   // 满了一个byte就需要重新分配内存size_t newSize = cur_byte + 1;bits = (char*)realloc(bits, newSize);bits[newSize - 1] = 0; /* Initialize the new byte. */}/** ************************************************************************* 需要注意的是右孩子的处理, 因为左孩子已经是0, 初始分配bits的时候就已经把每位都设置为0了.* *************************************************************************//* If a one must be added then or it in. If a zero* must be added then do nothing, since the byte* was initialized to zero. */if(leaf == parent->one) {/** 右孩子的话就需要把当前位置为1*/bits[cur_byte] |= (1<<cur_bit);}++numbits;leaf = parent;}if( bits != 0) {/** 如果编码里头含有1, 则需要进行反转, 如果全为0, 反转后和反转前是一样的, 就没必要反转了*/reverse_bits(bits, numbits);}p = (huffman_code*)malloc(sizeof(huffman_code));p->numbits    = numbits; /* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */p->bits    = bits;       /* 用来存储编码的区间 */return p;
}/*** 创建一个孤立的"叶子"节点, 该节点为一个单独的树* symbol : 该叶子节点的权,即字符*/
static huffman_node*
new_leaf_node(unsigned char symbol)
{huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );p->isLeaf = 1;p->symbol = symbol;p->count = 0;p->parent = 0;return p;
}/*** 创建节点(该节点不是叶子节点)** count  : 字符出现的次数* zero   : 左孩子* one    : 右兄弟** return : 返回一个节点对象*/
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );p->isLeaf = 0;p->count = count;p->zero = zero;p->one = one;p->parent = 0;return p;
}/*** 释放树占用的内存空间*/
static void
free_huffman_tree(huffman_node *subtree)
{if(subtree == NULL)return;   if( !(subtree->isLeaf) ) {/** 先序遍历进行递归调用*/free_huffman_tree( subtree->zero );free_huffman_tree( subtree->one );}free( subtree );
}
/*** free code*/
static void
free_code(huffman_code* p)
{free(p->bits);free(p);
}// ----------------------------------------------------------------------------------------
#define MAX_SYMBOLS 256
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];   /*  */
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];       /*  */
/*  */
static void
free_encoder(SymbolEncoder *pSE)
{unsigned long i;for(i = 0; i < MAX_SYMBOLS; ++i) {huffman_code *p = (*pSE)[i];if( p )    free_code(p);}
}
/*  */
static void
init_frequencies(SymbolFrequencies *pSF)
{memset(*pSF, 0, sizeof(SymbolFrequencies) );  /* 清零 */
}// ----------------------------------------------------------------------------------------
typedef struct buf_cache_tag
{/** 该结构主要描述了两个部分, 一个是cache, 一个是bufout.* cache是一个临时存储数据的buffer, cache会将数据写往bufout区间,* bufout类似一个仓库, 会一直存储cache写入的数据.* cache可以多次网bufout内写数据, bufout会一直保存这些数据.* bufout是一个动态的buffer, cache每一次往bufout内写数据的时候bufout都需要realloc一次.*/unsigned char *cache;       // 指向真正存储数据的bufferunsigned int cache_len; // buffer的长度, 初始的时候就可以设置 cache的大小的unsigned int cache_cur; // 数据结尾处(或者说是光标位置)unsigned char **pbufout; /** cache要写数据就往这个空间内写(类似一个动态仓库, 一定是动态的)* (*pbufout)就是真实的存储区*/unsigned int *pbufoutlen;  // 仓库的大小
} buf_cache;
/* 初始化一个buf_cache */
static int init_cache(buf_cache    *pc,unsigned int    cache_size,unsigned char **pbufout,unsigned int    *pbufoutlen)
{assert(pc && pbufout && pbufoutlen);if(!pbufout || !pbufoutlen) return 1;pc->cache     = (unsigned char*)malloc(cache_size);  // 分配存储空间pc->cache_len = cache_size; //pc->cache_cur = 0;       // 光标从0开始pc->pbufout   = pbufout; //*pbufout      = NULL;       //pc->pbufoutlen    = pbufoutlen;*pbufoutlen   = 0;       //return (pc->cache==NULL ? 0 : 1);
}/* 释放buf_cache */
static void free_cache(buf_cache* pc)
{assert( pc );if( pc->cache != NULL){/** 我觉得这里没有必要free( pc->cache );, 直接执行pc->cache = NULL;就可以保证* 逻辑上清cache了, 至于真实的存储区内是否仍然有数据并没有意义.*/free( pc->cache );pc->cache = NULL;}
}
/** 将cache内的数据写到pbufout中去, 并清洗cache* 成功则返回0, 失败返回1.*/
static int flush_cache(buf_cache* pc)
{assert( pc );if(pc->cache_cur > 0)    // 确定cache_cur有平移, 这样才能确定cache内有数据. 才可以flush{/* 当前要写的数据长度为pc->cache_cur, 原来的bufout里头本身还有*(pc->pbufoutlen)长度的数据 */unsigned int newlen = pc->cache_cur + *(pc->pbufoutlen);/* 需要重新为*(pc->pbufout)分配空间, tmp为这个新buffer的首地址 */unsigned char*    tmp = realloc(*(pc->pbufout), newlen);if( !tmp ) return 1;/* 追加到pbufout结尾处, 而不是覆盖 */memcpy(tmp + *(pc->pbufoutlen), pc->cache, pc->cache_cur);*pc->pbufout = tmp;         // pbufout指针重定位到新的扩大了的内存区.  *pc->pbufoutlen = newlen;   // 重新计算pbufoutlenpc->cache_cur = 0;          // cache逻辑上清零}return 0;
}
/* 写cache */
static int write_cache(buf_cache* pc,const void *to_write,unsigned int to_write_len)
{unsigned char* tmp;assert(pc && to_write);assert(pc->cache_len >= pc->cache_cur);/* If trying to write more than the cache will hold* flush the cache and allocate enough space immediately,* that is, don't use the cache. */if(to_write_len > pc->cache_len - pc->cache_cur){/** to_write_len : 需要往cache内写的数据长度* pc->cache_len-pc->cache_cur : cache内剩余空间* 如果cache存储能力不够则先清洗cache, 再将数据直接写到* pbufout中去, 不使用cache.*/unsigned int newlen;flush_cache( pc );newlen = *pc->pbufoutlen + to_write_len;tmp = realloc(*pc->pbufout, newlen);if( !tmp ) return 1;memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);*pc->pbufout = tmp;*pc->pbufoutlen = newlen;}else{/** Write the data to the cache* 如果cache存储能力足够,则往cache内追加数据且将当前光标移动到新的数据尾部.*/memcpy(pc->cache+pc->cache_cur, to_write, to_write_len);pc->cache_cur += to_write_len;}return 0;
}// -------------------------------------------------------------------------------------
/** 计算FILE对象内的各个字符出现的频率* 扫描FILE对象,*/
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{int c;unsigned int total_count = 0;   // FILE对象内的字符总数init_frequencies( pSF ); /* Set all frequencies to 0. *//* Count the frequency of each symbol in the input file. */while( (c=fgetc(in)) != EOF ){unsigned char uc = c;if( !(*pSF)[uc] ) // 如果这个字符没有出现过则为这个字符建立一个叶子{/** 这里设计的相当有意思,* fgetc会获得当前光标所在的字符, 这个字符必然是一个char, 也就是一个* unsinged int型数.* 我们前面有定义* #define MAX_SYMBOLS 256* typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];* typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];* 这里我们采用SymbolFrequencies[uc]就存储c这个字符.* 也就是说* SymbolFrequencies[0]==0x00* SymbolFrequencies[1]==0x01*  ......* SymbolFrequencies[65]==A  ( A==65 :) )* SymbolFrequencies[66]==B  ( B==66 :) )*  ......* 起初在typedf SymbolFrequencies的时候, 我着实没看懂, 实在太巧妙了.*/(*pSF)[uc] = new_leaf_node( uc );}/** 这个自加也非常巧妙, 遇到uc字符则将第uc个huffman_node的count自加* 实际就是一个字符一个字符读下去,,* 读到了A就 ++(pSF['A'].count) == ++(pSF[65].count)* 读到了B就 ++(pSF['B'].count) == ++(pSF[66].count)* 真TNND的牛X!*/++( (*pSF)[uc]->count );++total_count;}return total_count;
}
/* 计算buffer内各个字符的频率,和get_symbol_frequencies函数同理 */
static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies    *pSF,const unsigned char    *bufin,unsigned int       bufinlen)
{unsigned int i;unsigned int total_count = 0;/* Set all frequencies to 0. */init_frequencies(pSF);/* Count the frequency of each symbol in the input file. */for(i = 0; i < bufinlen; ++i){unsigned char uc = bufin[i];if( !(*pSF)[uc] ){(*pSF)[uc] = new_leaf_node(uc);}++(*pSF)[uc]->count;++total_count;}return total_count;
}/** When used by qsort, SFComp sorts the array so that* the symbol with the lowest frequency is first. Any* NULL entries will be sorted to the end of the list.** 两个huffman_node进行对比, 以count作为比较依据.* 即对比两个不同 symbol 出现的频率* 非叶子节点通通排在后面*/
static int
SFComp(const void *p1, const void *p2)
{const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;/* Sort all NULLs to the end. */if(hn1 == NULL && hn2 == NULL)     return 0;if(hn1 == NULL)                    return 1;if(hn2 == NULL)                    return -1;if(hn1->count > hn2->count)        return 1;else if(hn1->count < hn2->count)   return -1;return 0;
}/** build_symbol_encoder builds a SymbolEncoder by walking* down to the leaves of the Huffman tree and then,* for each leaf, determines its code.** 递归方式为每个叶子节点编码*/
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSE)
{if(subtree == NULL)  return;if( subtree->isLeaf ){(*pSE)[subtree->symbol] = new_code( subtree );}else{build_symbol_encoder(subtree->zero, pSE);build_symbol_encoder(subtree->one, pSE);}
}/** calculate_huffman_codes turns pSF into an array* with a single entry that is the root of the* huffman tree. The return value is a SymbolEncoder,* which is an array of huffman codes index by symbol value.** 为每个node编码. 这个函数比较重要, 精华就是在这个函数里头的for循环. 哈哈* 整个tree的建立全都依赖这个函数*/
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{unsigned int i = 0;unsigned int n = 0;huffman_node *m1  = NULL, *m2 = NULL;SymbolEncoder *pSE = NULL;/** Sort the symbol frequency array by ascending frequency.* 快速排序例程进行排序* 以symbol频率为关键字做升序排列* 有symbol的节点都会按升序排列, 没有symbol的节点会统一排在后面,* 通过一个for就能计算出symbol的个数了.*/qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);/** Get the number of symbols.* 计算huffman树中的字符数, 这个实现可读性不够好*/for(n = 0; (n<MAX_SYMBOLS) && (*pSF)[n]; ++n);/** Construct a Huffman tree. This code is based* on the algorithm given in Managing Gigabytes* by Ian Witten et al, 2nd edition, page 34.* Note that this implementation uses a simple* count instead of probability.*/for(i = 0; i < (n-1); ++i){/* Set m1 and m2 to the two subsets of least probability. */m1 = (*pSF)[0];m2 = (*pSF)[1];/* Replace m1 and m2 with a set {m1, m2} whose probability* is the sum of that of m1 and m2.* 这个算法有优化的余地的, 因为n在一直减小.* 将最小的两个元素合并后得到一个一个节点为m12, 此时m1,m2已经建立起来了关系.* 这个m12的地址又被pSF[0]存储, 循环直至整个Tree建立成功.* 指针在这里运用的实在太巧妙了.* 这一行代码就是建树, 靠，NBA!*/(*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);(*pSF)[1] = NULL;/** Put newSet into the correct count position in pSF.* 这里应该可以再进行优化, 是否有必要再进行排序, 或者被排序的数组过长了.* 实际上每循环一次n都减少了一次*/qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);}/* for完毕的时候就求出了root, pSF[0]就是root, 后面的元素都是NULL* 而树通过for循环里头的* (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);* 已经建立完成了*//* Build the SymbolEncoder array from the tree. */pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));memset(pSE, 0, sizeof(SymbolEncoder));build_symbol_encoder((*pSF)[0], pSE);return pSE;
}/** Write the huffman code table. The format is:* 4 byte code count in network byte order.* 4 byte number of bytes encoded*   (if you decode the data, you should get this number of bytes)* code1* ...* codeN, where N is the count read at the begginning of the file.* Each codeI has the following format:* 1 byte symbol, 1 byte code bit length, code bytes.* Each entry has numbytes_from_numbits code bytes.* The last byte of each code may have extra bits, if the number of* bits in the code is not a multiple of 8.** 编码后的格式 :* 0-3个byte是FILE内出现的不同字符个数(几不同的字符个数)* 4-7个byte是FILE内出现的全部字符个数(所有的字符)* 8-X是真正的编码后值**/
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{unsigned long i, count = 0;/** Determine the number of entries in se* 计算 SymbolEncoder 内具有编码值的元素个数.* 即有几种字符*/for(i = 0; i < MAX_SYMBOLS; ++i)if( (*se)[i] )++count;/** Write the number of entries in network byte order.* 将字符种数写入到文件头部, 即[0, 3]一共4个字节*///i = htonl( count );i = count;if(fwrite(&i, sizeof(i), 1, out) != 1) return 1;/** Write the number of bytes that will be encoded.* 将字符个数追加到[4,7]一共4个字节*///symbol_count = htonl(symbol_count);symbol_count = symbol_count;if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)   return 1;/** Write the entries.*/for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if( p != NULL ){   /** 每个单元分为三个部分 : * symbol  -- 字符* numbits  -- 叶子走到root需要的步数* bits    -- 叶子走到root的方式(即最终的编码, 比如说0101)*/unsigned int numbytes;/* Write the 1 byte symbol. */fputc((unsigned char)i, out);  /* Write the 1 byte code bit length. */fputc(p->numbits, out);/* Write the code bytes. 她这个注释就没有说是几byte, 值得思考一下 */numbytes = numbytes_from_numbits( p->numbits );/* 将叶子走到root的方式写进去, 这个方式会被整理为byte格式, 不够就补0 */if(fwrite(p->bits, 1, numbytes, out) != numbytes)    return 1;}}return 0;
}/** Allocates memory and sets *pbufout to point to it. The memory* contains the code table.** 以指定的格式将编码后的数据写入到cache中去, 实际是写到pbufout中去了.**/
static int
write_code_table_to_memory(buf_cache      *pc,SymbolEncoder   *se,unsigned int    symbol_count)
{unsigned long i, count = 0;/* Determine the number of entries in se. */for(i = 0; i < MAX_SYMBOLS; ++i){if((*se)[i]){++count;   // 计算不同字符的个数}}/* Write the number of entries in network byte order. *///i = htonl(count);i = count;if( write_cache(pc, &i, sizeof(i)) )   // 前四个字节是memory内所有字符数return 1;/* Write the number of bytes that will be encoded. *///symbol_count = htonl(symbol_count);symbol_count = symbol_count;if( write_cache(pc, &symbol_count, sizeof(symbol_count)) )  // 4-8字节是不同字符个数return 1;/* Write the entries. */for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if( p ){/** 对于每次循环来说, 如果p不为NULL, 则将该字符对应的编码写入到cache内.* 存储格式为三个字节作为一个单位.* byte0 --- 字符本身* byte1 --- 该字符编码后的码值长度(即2进制的位数)* byte2 --- 该字符对应的码值*/unsigned int numbytes;/** The value of i is < MAX_SYMBOLS (256), so it can* be stored in an unsigned char.* 将i转换为char型, 可以对应到字符集*/unsigned char uc = (unsigned char)i;/** Write the 1 byte symbol.* 将字符写到cache内*/if(write_cache(pc, &uc, sizeof(uc)))   return 1;/** Write the 1 byte code bit length.* 将叶子节点到root所需要经过的步数写到cache内, 也就是编码的长度* 这个数据是为了解码使用的.*/uc = (unsigned char)p->numbits;if(write_cache(pc, &uc, sizeof(uc)))   return 1;/** Write the code bytes.* 将编码值对齐并写如到cache内* 事先必须知道编码由几位组成, 如果编码为9位, 那么就需要2个byte来存储这个码值* 如果编码为4位, 那么就需要1个byte来存储了,*/numbytes = numbytes_from_numbits(p->numbits);if(write_cache(pc, p->bits, numbytes)) return 1;}}return 0;
}/** read_code_table builds a Huffman tree from the code* in the in file. This function returns NULL on error.* The returned value should be freed with free_huffman_tree.***/
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/** Read the number of entries.* (it is stored in network byte order).* 获得字符种数, 前2个byte就是出现的字符种数*/if( fread(&count, sizeof(count), 1, in) != 1 ){free_huffman_tree( root );return NULL;}//count = ntohl(count);count = count;/** Read the number of data bytes this encoding represents.* 一个有多少个字符*/if( fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1 ){free_huffman_tree(root);return NULL;}//*pDataBytes = ntohl(*pDataBytes);*pDataBytes = *pDataBytes;/* Read the entries. */while(count-- > 0){int           c;unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if( (c=fgetc(in)) == EOF ){free_huffman_tree( root );return NULL;}symbol = (unsigned char)c;if( (c=fgetc(in)) == EOF ){free_huffman_tree( root );return NULL;}numbits    = (unsigned char)c;numbytes   = (unsigned char)numbytes_from_numbits( numbits );bytes      = (unsigned char*)malloc( numbytes );if( fread(bytes, 1, numbytes, in) != numbytes ){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one =curbit == (unsigned char)(numbits-1) ?new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero =curbit == (unsigned char)(numbits - 1) ?new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}
/** 将数据从buf读到bufout中, 成功则返回0, 其他则返回1.* pindex  -- 拷贝的起点*/
static int
memread(const unsigned char*    buf,unsigned int         buflen,unsigned int         *pindex,void*                bufout,unsigned int         readlen)
{assert(buf && pindex && bufout);assert(buflen >= *pindex);// 错误if(buflen < *pindex)        return 1;if(readlen + *pindex >= buflen) return 1;memcpy(bufout, buf + *pindex, readlen);*pindex += readlen;return 0;
}
/** 从编码后的buf内读数据.*/
static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,unsigned int         bufinlen,unsigned int         *pindex,unsigned int         *pDataBytes)
{huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/** Read the number of entries.* (it is stored in network byte order).* 读取*/if( memread(bufin, bufinlen, pindex, &count, sizeof(count)) ){free_huffman_tree(root);return NULL;}//count = ntohl(count);count = count;/* Read the number of data bytes this encoding represents. */if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes))){free_huffman_tree(root);return NULL;}//*pDataBytes = ntohl(*pDataBytes);*pDataBytes = *pDataBytes;/* Read the entries. */while( (count--) > 0 ){unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol))){free_huffman_tree(root);return NULL;}if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits))){free_huffman_tree(root);return NULL;}numbytes = (unsigned char)numbytes_from_numbits(numbits);bytes = (unsigned char*)malloc(numbytes);if(memread(bufin, bufinlen, pindex, bytes, numbytes)){free(bytes);free_huffman_tree(root);return NULL;}/** Add the entry to the Huffman tree. The value* of the current bit is used switch between* zero and one child nodes in the tree. New nodes* are added as needed in the tree.*/for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit)){if(p->one == NULL){p->one = ( curbit==(unsigned char)(numbits - 1) ) ?new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;
}
/** 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作* 也就是不将编码强制为byte类型了, 而是直接写入到out中.*/
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{unsigned char curbyte = 0;unsigned char curbit  = 0;int c;while( (c = fgetc(in)) != EOF){//printf("c=%c   /n", c);unsigned char uc     = (unsigned char)c;huffman_code *code = (*se)[uc];unsigned long i;for(i = 0; i < code->numbits; ++i){//printf("i=%d   /n", i);/* Add the current bit to curbyte. */curbyte |= get_bit(code->bits, i) << curbit;/* If this byte is filled up then write it* out and reset the curbit and curbyte. */if(++curbit == 8){/** 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作* 也就是不将编码强制为byte类型了, 而是直接写入到out中.*/fputc(curbyte, out);printf("curbyte=%d/n", curbyte);// printf("curbyte=%d   /n", curbyte);curbyte = 0;curbit = 0;}}printf("/n");}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/if(curbit > 0)fputc(curbyte, out);return 0;
}
/** 对memory进行编码.** pc      -- cache* bufin   -- 待编码的buffer* bufinlen   -- buffer长, 即字符个数* se      --*/
static int
do_memory_encode(buf_cache             *pc,const unsigned char *bufin,unsigned int         bufinlen,SymbolEncoder           *se)
{unsigned char curbyte    = 0;       //unsigned char curbit = 0;unsigned int i;/* 对 bufin 内的字符依次循环 */for(i = 0; i < bufinlen; ++i){unsigned char uc = bufin[i];huffman_code *code = (*se)[uc]; // 取出第i个字符的编码unsigned long i;           /* 对第i个字符编码长度进行循环 */for(i = 0; i < code->numbits; ++i){/** Add the current bit to curbyte.* 依次取出*/curbyte |= ( get_bit(code->bits, i) << curbit );/** If this byte is filled up then write it* out and reset the curbit and curbyte*/if(++curbit == 8){/** 满了一个字节则写cache**/if(write_cache(pc, &curbyte, sizeof(curbyte)))   return 1;curbyte = 0;curbit  = 0;}}}/** If there is data in curbyte that has not been* output yet, which means that the last encoded* character did not fall on a byte boundary,* then output it.*/return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;
}/** huffman_encode_file huffman encodes in to out.* 对FILE对象进行编码, 将*in编码后写入*out.*/
int
huffman_encode_file(FILE *in, FILE *out)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;/** Get the frequency of each symbol in the input file.* 将FILE中的数据写入到SymbolFrequencies中, 计算出了各个字符出现频率,* 也计算出了FILE对象内的总字符数.*/symbol_count = get_symbol_frequencies(&sf, in);/** Build an optimal table from the symbolCount.* 建树完毕, root就是sf[0], sf[1]=sf[2]=...sf[MAX_SYMBOLS]=NULL了*/se = calculate_huffman_codes( &sf );root = sf[0];/** Scan the file again and, using the table* previously built, encode it into the output file.*/rewind( in ); // 将文件指针重新指向一个流的开头/** 将编码写如文件流*/rc = write_code_table(out, se, symbol_count);if(rc == 0)rc = do_file_encode(in, out, se);/* Free the Huffman tree. */free_huffman_tree( root );free_encoder(se);return rc;
}/*** 对FILE进行解码*/
int
huffman_decode_file(FILE *in, FILE *out)
{huffman_node *root;huffman_node *p;int           c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);if( !root )   return 1;/* Decode the file. */p = root;while(data_count>0 && (c=fgetc(in))!=EOF){unsigned char byte = (unsigned char)c;unsigned char mask = 1;while(data_count > 0 && mask){p = ( (byte&mask)? p->one : p->zero );mask <<= 1;if( p->isLeaf ){fputc(p->symbol, out);p = root;--data_count;}}}free_huffman_tree( root );return 0;
}// --------------------------------------------------------------------------------------
#define CACHE_SIZE 1024     /** 对memory编码需要考虑到数据源的速度和bufout的接收速度不匹配, cache* cache是为了连接不同速度的设备而设计的*/
/*** 对buffer进行编码**/
int huffman_encode_memory(const unsigned char    *bufin,unsigned int           bufinlen,unsigned char       **pbufout,unsigned int           *pbufoutlen)
{SymbolFrequencies    sf;SymbolEncoder     *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;  // memory中的字符个数buf_cache cache;         ///* Ensure the arguments are valid. 检测参数合法性 */if(!pbufout || !pbufoutlen) return 1;if( init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen) )   return 1;/** Get the frequency of each symbol in the input memory* 计算bufin内各个字符出现的频率, 并求得bufin内存储的字符个数.*/symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);/** Build an optimal table from the symbolCount.* 为每个Node编码, 如果这个Node的symbol为NULL, 则不编码了.*/se = calculate_huffman_codes( &sf );root = sf[0]; // root来拉, 哈哈, 逻辑树出来了./** Scan the memory again and, using the table* previously built, encode it into the output memory.* 将se内的数据统统的写入到cache中克.*/rc = write_code_table_to_memory(&cache, se, symbol_count);if(rc == 0){/** 为什么write_code_table_to_memory成功之后还要要执行一次do_memory_encode?**/rc = do_memory_encode(&cache, bufin, bufinlen, se);}/* Flush the cache. */flush_cache( &cache );/* Free the Huffman tree. */free_huffman_tree( root );free_encoder( se );free_cache( &cache );return rc;
}/*** 对bufin进行解码. 将解码后的数据写入到bufout中.*/
int huffman_decode_memory(const unsigned char    *bufin,unsigned int           bufinlen,unsigned char       **pbufout,unsigned int           *pbufoutlen)
{huffman_node *root, *p;unsigned int data_count;unsigned int i = 0;unsigned char *buf;unsigned int bufcur = 0;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen) return 1;/* Read the Huffman code table. */root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);if(!root)  return 1;buf = (unsigned char*)malloc(data_count);/* Decode the memory. */p = root;for(; i < bufinlen && data_count > 0; ++i){unsigned char byte = bufin[i];unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){buf[bufcur++] = p->symbol;p = root;--data_count;}}}free_huffman_tree(root);*pbufout = buf;*pbufoutlen = bufcur;return 0;
}/*
--------------------------------------------------------------------------------不妨假设待编码的buffer为 "ABCAADC"
手工分析可得该树的形状  :
当然也可以也可以将这个树沿y方向翻转180度root///  /A    *///  /C    *///  /B   D现在我们知道的两个事实是 :
这个buffer内的字符数为         : symbol_count    = 7  ( "ABCAADC"一个有7个字符 )
这个buffer内出现的字符种数为    : count       = 4 ( 只出现了ABCD四种字符 )接下来人工分析各个字符 :
symbol |   count  |       bits
--------|-----------|---------------------A     |     3     |   0000 0000B     |     1     |   0000 0110C     |     2     |   0000 0010D     |     1     |    0000 0111
我们设置左边为0, 右边为1. bits为从叶子节点走到root的路径.分析完毕后, 需要实现整个编码过程. 编码过程暂时跳过.
假设成功编码完毕, 需要把编码后的数据写入到bufout内.bufout内的
0-3个byte为字符种数 count
4-7个byte为字符个数 symbol_count然后是遍历SymbolEncoder, 依次对每种字符进行编码(我们这个例子只进行4次编码)
我们对每种字符都会进行编码, 每个字符编码后的输出不妨称为frame
那么这个frame是由三个部分组成的:(这个我们可以肯定一个char肯定是1byte的)
symbol (1byte)    -- 字符(这个我们可以肯定就算这个树根本没有分支, 永远只有左/右孩子, 那也了不起是是256的深度)
numbits (1byte)   -- 叶子走到root需要的步数bits   (1byte)    -- 叶子走到root的方式(即最终的编码, 比如说011)
开始我对这个bites到底占了多少个byte很是怀疑, 因为我不知道从叶子走到root
到底耗费了几步. 这里需要好好研究一下, 最好和最差情况. 暂时假设是个变化的byte吧.
但是有一点bites跟numbits是有关系的, 所以只要知道numbits还是可以知道bites占据了
多少byte的, 也知道bits到底是有几位.byte       content
---------------------------------------------0-3(4)     count4-7(4)     symbol_countA占xa个byte frame_structB占xb个byte frame_structC占xc个byte frame_structD占xd个byte frame_structX          X      这个X是do_file_encode函数写到bufout中去的数据, 那么这个数据是什么呢?
实际上它是循环的把出现的字符的bits写到bufout中,
根据这个数据,解码的时候就可以依次的找到第0,1,2...个位置出现的是什么字符了
--------------------------------------------------------------------------------
*/
Huffman编解码完全注释相关推荐

实验三 Huffman编解码算法实现与压缩效率分析
一.Huffman编解码原理 1. Huffman编码对原始文件进行Huffman编码,首先需要解决以下几点问题: 文件符号的概率分布情况是怎样的? Huffman树是如何建立的? 建立起Huffm ...
Huffman编解码
Huffman编解码算法实现与压缩效率分析一.背景知识及相关公式 1.信源熵信源熵是信息的度量单位,一般用H表示,单位是比特,对于任意一个随机变量,它的熵定义为,变量的不确定性越大,熵也就越大. ...
多媒体技术与应用之图像Huffman编解码
多媒体技术与应用之图像Huffman编解码一.实验内容 1.了解BMP图像的格式,实现BMP图片格式的数据域及文件头的分离 2.熟悉Huffman编码原理 3.使用Huffman编码算法对给定图像文 ...
数据压缩实验三 Huffman编解码算法实现与压缩效率分析
实验目的掌握Huffman编解码实现的数据结构和实现框架, 进一步熟练使用C编程语言, 并完成压缩效率的分析. 实验原理 1．本实验中Huffman编码算法 (1)将文件以ASCII字符流的形式读入 ...
huffman编解码算法实验与压缩效率分析
一.基本原理 1.huffman编码原理 huffman编码是一种无失真编码方式,是可变长(VLC)编码的一种. huffman编码基于信源的概率统计模型,基本思路是出现概率大的信源符号编长码,出现概 ...
实验三—Huffman编解码
一.实验原理 1.Huffman编码的步骤: (1)首先将所有字符发生的概率从小到大进行排序: (2)将最小的两个概率进行两两一合并,之后继续找最小的两个概率进行合并包括前面已经合并的和数: (3)一 ...
Huffman编解码实现文本压缩
编码方案设计: 实现最基本的哈夫曼编码,对文件进行两次扫描,第一次统计概率,第二次进行编码.压缩和解压缩时分别重新建立树,重新编码,以减小压缩后文件大小. 系统实现方案: typedef struct ...
数据压缩实验三--Huffman编解码及压缩率的比较
一,Huffman码 1 Huffman 编码 Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman编码是可变字长编码(VLC)的一种. Huffman 编码基于信源 ...
数据压缩原理实验三 Huffman编解码算法实现与压缩效率分析
实验原理 Huffman编码是一种无失真编码方式,是一种可变长编码,它将出现概率大的信源符号短编码,出现概率小的信源符号长编码. 编码步骤: ①将文件以ASCII字符流的形式读入,统计每个符号的发生概 ...
Huffman编解码完全注释

Huffman编解码完全注释

Huffman编解码完全注释相关推荐

最新文章

热门文章