数据压缩原理 实验三 Huffman编解码算法实现与压缩效率分析
实验原理
Huffman编码是一种无失真编码方式,属于可变长编码:它为出现概率大的信源符号分配较短的码字,为出现概率小的信源符号分配较长的码字。
编码步骤:
①将文件以ASCII字符流的形式读入,统计每个符号的发生概率
②将所有文件中出现过的字符按照概率从小到大的顺序排列
③每一次选出最小的两个值,作为二叉树的两个子节点,将和作为他们的父节点,这两个子节点不再参与比较,新的父节点参与比较
④重复上一步,直到最后得到和为1的根节点
⑤将形成的二叉树的左节点标0,右节点标1,把从最上面的根节点到最下面的树叶节点途中遇到的0和1按序串联,即为该字符的编码表示
实验流程
代码分析
实验中将实际完成编码工作的工程Huff_code封装成一个静态链接库,由工程huff_run来调用,huff_run完成的工作包括解析命令行参数,打开、读取、关闭输入文件,打开关闭输出文件,调用Huff_code完成编码。
Huff_run
huffcode.c
#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endifstatic int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);static void version(FILE *out)
{fputs("huffcode 0.3\n""Copyright (C) 2003 Douglas Ryan Richardson""; Gauss Interprise, Inc\n",out);
}static void usage(FILE* out)
{fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c]\n""-i - input file (default is standard input)\n""-o - output file (default is standard output)\n""-d - decompress\n""-c - compress (default)\n""-m - read file into memory, compress, then write to file (not default)\n""-t - output huffman statistics\n",out);
}int main(int argc, char** argv)
{char memory = 0;char compress = 1;int opt;const char *file_in = NULL, *file_out = NULL;const char *file_out_table = NULL;FILE *in = stdin;FILE *out = stdout;FILE * outTable = NULL;/* Get the command line arguments. */while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1) //演示如何跳出循环,及查找括号对{switch(opt){case 'i':file_in = optarg;break;case 'o':file_out = optarg;break;case 'c':compress = 1;break;case 'd':compress = 0;break;case 'h':usage(stdout);return 0;case 'v':version(stdout);return 0;case 'm':memory = 1;break;case 't':file_out_table = optarg; break;default:usage(stderr);return 1;}}/* If an input file is given then open it. */if(file_in){in = fopen(file_in, "rb");if(!in){fprintf(stderr,"Can't open input file '%s': %s\n",file_in, strerror(errno));return 1;}}/* If an output file is given then create it. */if(file_out){out = fopen(file_out, "wb");if(!out){fprintf(stderr,"Can't open output file '%s': %s\n",file_out, strerror(errno));return 1;}}if(file_out_table){outTable = fopen(file_out_table, "w");if(!outTable){fprintf(stderr,"Can't open output file '%s': %s\n",file_out_table, strerror(errno));return 1;}}if(memory){return compress ?memory_encode_file(in, out) : memory_decode_file(in, out);}if(compress)huffman_encode_file(in, out,outTable);elsehuffman_decode_file(in, out);if(in)fclose(in);if(out)fclose(out);if(outTable)fclose(outTable);return 0;
}static int memory_encode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Encode the memory. */if(huffman_encode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;
}static int memory_decode_file(FILE *in, FILE *out)
{unsigned char *buf = NULL, *bufout = NULL;unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;assert(in && out);/* Read the file into memory. */while(!feof(in)){unsigned char *tmp;len += inc;tmp = (unsigned char*)realloc(buf, len);if(!tmp){if(buf)free(buf);return 1;}buf = tmp;cur += fread(buf + cur, 1, inc, in);}if(!buf)return 1;/* Decode the memory. */if(huffman_decode_memory(buf, cur, &bufout, &bufoutlen)){free(buf);return 1;}free(buf);/* Write the memory to the file. */if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen){free(bufout);return 1;}free(bufout);return 0;
}
getopt.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>/* declarations to provide consistent linkage */
/* declarations to provide consistent linkage */
extern char *optarg;
extern int optind;
extern int opterr;

int opterr = 1,     /* if error message should be printed */
    optind = 1,     /* index into parent argv vector */
    optopt,         /* character checked for validity */
    optreset;       /* reset getopt */
char *optarg;       /* argument associated with option */

/* Each directive needs its own line to be seen by the preprocessor. */
#define BADCH   (int)'?'
#define BADARG  (int)':'
#define EMSG    ""

/*
 * getopt --
 *  Parse argc/argv argument vector.
 *
 * nargc/nargv: the argument vector to scan, starting at index `optind`.
 * ostr:        accepted option letters; a letter followed by ':' takes an
 *              argument, which is stored in `optarg`.
 *
 * Returns the option letter found, BADCH ('?') for an unknown option or a
 * missing argument (BADARG (':') instead when ostr starts with ':'), and
 * EOF once a non-option argument, "--", or the end of nargv is reached.
 */
int
getopt(int nargc, char * const *nargv, const char* ostr)
{
    static char *place = EMSG;      /* option letter processing */
    char *oli;                      /* option letter list index */

    if (optreset || !*place) {      /* update scanning pointer */
        optreset = 0;
        if (optind >= nargc || *(place = nargv[optind]) != '-') {
            place = EMSG;
            return (EOF);
        }
        if (place[1] && *++place == '-') {  /* found "--" */
            ++optind;
            place = EMSG;
            return (EOF);
        }
    }

    /* option letter okay? */
    if ((optopt = (int)*place++) == (int)':' ||
        !(oli = strchr(ostr, optopt))) {
        /*
         * if the user didn't specify '-' as an option,
         * assume it means EOF.
         */
        if (optopt == (int)'-')
            return (EOF);
        if (!*place)
            ++optind;
        if (opterr && *ostr != ':')
            (void)fprintf(stderr,
                "%s: illegal option -- %c\n", __FILE__, optopt);
        return (BADCH);
    }

    if (*++oli != ':') {            /* don't need argument */
        optarg = NULL;
        if (!*place)
            ++optind;
    }
    else {                          /* need an argument */
        if (*place)                 /* no white space */
            optarg = place;
        else if (nargc <= ++optind) {   /* no arg */
            place = EMSG;
            if (*ostr == ':')
                return (BADARG);
            if (opterr)
                (void)fprintf(stderr,
                    "%s: option requires an argument -- %c\n",
                    __FILE__, optopt);
            return (BADCH);
        }
        else                        /* white space */
            optarg = nargv[optind];
        place = EMSG;
        ++optind;
    }
    return (optopt);                /* dump back option letter */
}
Huff_code
huffman.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"

/* htonl()/ntohl() come from winsock2.h on Windows and netinet/in.h elsewhere. */
#ifdef WIN32
#include <winsock2.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#endif
/*
 * One node of the Huffman tree.
 *
 * A leaf stores the source symbol it represents; an interior node stores
 * its two children. Both share storage through the anonymous union, so
 * `isLeaf` must be checked before touching either.
 */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;               /* non-zero when this node is a leaf */
    unsigned long count;                /* occurrences of the symbol(s) under this node */
    struct huffman_node_tag *parent;    /* parent node, 0 while the node has none */

    union
    {
        struct
        {
            /* interior node: child reached by a 0 bit / by a 1 bit */
            struct huffman_node_tag *zero, *one;
        };
        unsigned char symbol;           /* leaf: the source symbol */
    };
} huffman_node;
/*
 * The code word assigned to one symbol.
 *
 * Bits are packed least-significant-bit first: code bit 0 is bit 0 of
 * bits[0], code bit 7 is bit 7 of bits[0], code bit 8 is bit 0 of bits[1],
 * and so on.
 */
typedef struct huffman_code_tag
{
    /* The length of this code in bits. */
    unsigned long numbits;

    /* The packed code bits; only the first `numbits` bits are meaningful. */
    unsigned char *bits;
} huffman_code;
/*
 * Per-symbol statistics gathered for the report table, indexed by byte value.
 */
typedef struct huffman_statistics_result
{
    float freq[256];                /* relative frequency of each byte value */
    unsigned long numbits[256];     /* code length in bits; 0 when the symbol is unused */
    unsigned char bits[256][100];   /* packed code bits; codes are assumed to fit in 100 bytes */
}huffman_stat;
/*
 * Number of whole bytes needed to hold `numbits` bits (rounded up).
 */
static unsigned long numbytes_from_numbits(unsigned long numbits)
{
    unsigned long whole = numbits >> 3;

    /* A partial trailing byte still occupies one byte. */
    if((numbits & 7UL) != 0)
        ++whole;

    return whole;
}
/*
 * Return bit `i` (0-based) of the packed bit array `bits`, as 0 or 1.
 * Within each byte, bits are numbered from the least significant bit up.
 */
static unsigned char get_bit(unsigned char* bits, unsigned long i)
{
    const unsigned char holder = bits[i >> 3];
    const unsigned int shift = (unsigned int)(i & 7UL);

    return (unsigned char)((holder >> shift) & 1u);
}
/*
 * Reverse, in place, the order of the first `numbits` bits of `bits`.
 *
 * Codes are built from leaf to root, so they must be reversed to obtain
 * the root-to-leaf order. The reversal swaps bit pairs directly rather
 * than going through an alloca() scratch buffer. Any unused bits in the
 * last byte are cleared.
 */
static void reverse_bits(unsigned char* bits, unsigned long numbits)
{
    unsigned long lo, hi;

    if(numbits == 0)
        return;

    for(lo = 0, hi = numbits - 1; lo < hi; ++lo, --hi)
    {
        const unsigned char lo_mask = (unsigned char)(1u << (lo % 8));
        const unsigned char hi_mask = (unsigned char)(1u << (hi % 8));
        const int lo_set = (bits[lo / 8] & lo_mask) != 0;
        const int hi_set = (bits[hi / 8] & hi_mask) != 0;

        /* Swapping two differing bits is the same as flipping both. */
        if(lo_set != hi_set)
        {
            bits[lo / 8] ^= lo_mask;
            bits[hi / 8] ^= hi_mask;
        }
    }

    /* Zero the padding bits of a partially used last byte. */
    if(numbits % 8)
        bits[numbits / 8] &= (unsigned char)((1u << (numbits % 8)) - 1u);
}
static huffman_code*new_code(const huffman_node*leaf)
{unsigned long numbits = 0; //码长unsigned char* bits = NULL; //码字首地址huffman_code *p; //定义指向码字结构体的指针while(leaf && leaf->parent) //leaf!=0表示当前字符存在{ //leaf->parent!=0表示当前在字符未编码完成 因为根节点没有父节点huffman_node *parent = leaf->parent;unsigned char cur_bit = (unsigned char)(numbits % 8); //所编位在当前byte中的位置unsigned long cur_byte = numbits / 8; //当前是第几个byte//realloc这里很关键,它与malloc不同//它在保持原有的数据不变的情况下重新分配空间//原有数据存在新空间中的前面部分//(这里空间的地址可能有变化)if(cur_bit == 0){size_t newSize = cur_byte + 1;bits = (unsigned char*)realloc(bits, newSize);bits[newSize - 1] = 0; //Initialize the new byte. 初始化新分配的8bit为0}if(leaf == parent->one) //如果是右子节点(若是左子节点因为初始化bits是0所以不用编)bits[cur_byte] |= 1 << cur_bit; //左移1至当前byte的当前位(特编位)++numbits; //码字位数加一leaf = parent; //把父节点作为下一个待编的 该编码过程是从树叶到树根的}if(bits)reverse_bits(bits, numbits); //整个码字逆序p = (huffman_code*)malloc(sizeof(huffman_code));p->numbits = numbits; //为码字结构体赋值p->bits = bits; //整数个字节,与numbits配合才可得到真正的码字return p;
}#define MAX_SYMBOLS 256
/* Table of MAX_SYMBOLS huffman_node pointers used while counting symbols and building the tree. */
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
/* Table of MAX_SYMBOLS huffman_code pointers, indexed by byte value: the code table. */
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];
static huffman_node*new_leaf_node(unsigned char symbol)
{huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node)); //开辟指向一个节点的指针p->isLeaf = 1; //是树叶p->symbol = symbol; //赋符号p->count = 0; //该节点的代表的频率p->parent = 0; //父节点初始化为0return p; //返回一个已经初始化的叶节点
}//////////新建中间节点 参数为该节点代表的频率及左右子节点//////////
/*
 * Allocate an interior node with the given count and children and no
 * parent. The children's own `parent` pointers are left for the caller
 * to set. Returns NULL when memory runs out.
 */
static huffman_node*new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));
    if(!p)
        return NULL;

    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = 0;
    return p;
}
/*
 * Recursively free `subtree` and every node below it.
 * A NULL subtree is accepted and ignored.
 */
static void free_huffman_tree(huffman_node *subtree)
{
    if(subtree == NULL)
        return;

    /* Only interior nodes have children; a leaf's union holds its symbol. */
    if(!subtree->isLeaf)
    {
        free_huffman_tree(subtree->zero);
        free_huffman_tree(subtree->one);
    }

    free(subtree);
}
/*
 * Release one code word: first its packed bit buffer, then the record.
 * `p` must not be NULL.
 */
static void free_code(huffman_code* p)
{
    unsigned char *payload = p->bits;

    free(payload);
    free(p);
}
/*
 * Free every code word stored in the table and then the table itself.
 * `pSE` must have been allocated with malloc.
 */
static void free_encoder(SymbolEncoder *pSE)
{
    unsigned long i;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*pSE)[i];
        if(p)
            free_code(p);   /* unused symbols have no code */
    }

    free(pSE);
}
/* Set every entry of the node table to NULL. */
static void init_frequencies(SymbolFrequencies *pSF)
{
    memset(*pSF, 0, sizeof(SymbolFrequencies));
}

/*
 * Small write-behind cache placed in front of a growing output buffer.
 * Bytes collect in `cache` and are appended to *pbufout when it fills.
 */
typedef struct buf_cache_tag
{
    unsigned char *cache;       /* staging area */
    unsigned int cache_len;     /* capacity of `cache` in bytes */
    unsigned int cache_cur;     /* bytes currently staged */
    unsigned char **pbufout;    /* caller's output buffer pointer */
    unsigned int *pbufoutlen;   /* caller's output length */
} buf_cache;

/*
 * Prepare `pc` with a staging area of `cache_size` bytes and reset the
 * caller's output buffer to empty (*pbufout = NULL, *pbufoutlen = 0).
 * Returns 0 on success, 1 on a bad argument or allocation failure.
 */
static int init_cache(buf_cache* pc,
                      unsigned int cache_size,
                      unsigned char **pbufout,
                      unsigned int *pbufoutlen)
{
    assert(pc && pbufout && pbufoutlen);
    if(!pbufout || !pbufoutlen)
        return 1;

    pc->cache = (unsigned char*)malloc(cache_size);
    pc->cache_len = cache_size;
    pc->cache_cur = 0;
    pc->pbufout = pbufout;
    *pbufout = NULL;
    pc->pbufoutlen = pbufoutlen;
    *pbufoutlen = 0;

    return pc->cache ? 0 : 1;
}

/* Release the staging area; the caller's output buffer is not touched. */
static void free_cache(buf_cache* pc)
{
    assert(pc);
    if(pc->cache)
    {
        free(pc->cache);
        pc->cache = NULL;
    }
}

/*
 * Append the staged bytes to the output buffer and empty the cache.
 * Returns 0 on success, 1 when the output buffer cannot grow; in that case
 * the staged bytes are kept.
 */
static int flush_cache(buf_cache* pc)
{
    assert(pc);

    if(pc->cache_cur > 0)
    {
        unsigned int newlen = pc->cache_cur + *pc->pbufoutlen;
        unsigned char* tmp = realloc(*pc->pbufout, newlen);
        if(!tmp)
            return 1;

        memcpy(tmp + *pc->pbufoutlen, pc->cache, pc->cache_cur);

        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
        pc->cache_cur = 0;
    }

    return 0;
}

/*
 * Write `to_write_len` bytes from `to_write` through the cache.
 *
 * Data that fits in the free part of the cache is staged there. Otherwise
 * the cache is flushed and the data is appended directly to the output
 * buffer. Returns 0 on success, 1 on allocation failure.
 */
static int write_cache(buf_cache* pc,
                       const void *to_write,
                       unsigned int to_write_len)
{
    unsigned char* tmp;

    assert(pc && to_write);
    assert(pc->cache_len >= pc->cache_cur);

    if(to_write_len > pc->cache_len - pc->cache_cur)
    {
        unsigned int newlen;

        /* A failed flush must stop here, or later bytes land out of order. */
        if(flush_cache(pc))
            return 1;

        newlen = *pc->pbufoutlen + to_write_len;
        tmp = realloc(*pc->pbufout, newlen);
        if(!tmp)
            return 1;

        memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);
        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
    }
    else
    {
        /* Write the data to the cache. */
        memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);
        pc->cache_cur += to_write_len;
    }

    return 0;
}
/*
 * First pass over the input: count how often each byte value occurs.
 *
 * *pSF is cleared first; a leaf node is created the first time a byte
 * value is seen and its count is incremented on every occurrence.
 * Returns the total number of bytes read from `in`.
 */
static unsigned int get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    int c;
    unsigned int total_count = 0;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input file. */
    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = c;

        if(!(*pSF)[uc])
            (*pSF)[uc] = new_leaf_node(uc);     /* first occurrence of this byte */
        ++(*pSF)[uc]->count;
        ++total_count;
    }

    return total_count;
}

/*
 * Same as get_symbol_frequencies, but the input is the `bufinlen` bytes at
 * `bufin` rather than a file. Returns `bufinlen`.
 */
static unsigned int get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
                                                       const unsigned char *bufin,
                                                       unsigned int bufinlen)
{
    unsigned int i;
    unsigned int total_count = 0;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input buffer. */
    for(i = 0; i < bufinlen; ++i)
    {
        unsigned char uc = bufin[i];

        if(!(*pSF)[uc])
            (*pSF)[uc] = new_leaf_node(uc);
        ++(*pSF)[uc]->count;
        ++total_count;
    }

    return total_count;
}
static int SFComp(const void *p1, const void *p2)
{/////强制转换p1、p2为霍夫曼节点的二维指针 并将第一行指针赋给hn1、hn2const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;/* Sort all NULLs to the end. */if(hn1 == NULL && hn2 == NULL)return 0;if(hn1 == NULL)return 1;if(hn2 == NULL)return -1;if(hn1->count > hn2->count)return 1;else if(hn1->count < hn2->count)return -1;return 0;
}#if 1
//////////打印256个信源及其出现次数//////////
static void print_freqs(SymbolFrequencies * pSF)
{size_t i;for(i = 0; i < MAX_SYMBOLS; ++i){if((*pSF)[i])printf("%d, %ld\n", (*pSF)[i]->symbol, (*pSF)[i]->count);elseprintf("NULL\n");}
}
#endif//////////为树叶编码,输入树根遍历码数找到树叶进行编码//////////
/*
 * Walk the tree below `subtree` and store a freshly built code word for
 * every leaf into the table `pSF`, at the index given by the leaf's symbol.
 * A NULL subtree is ignored.
 */
static void build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(subtree == NULL)
        return;

    if(subtree->isLeaf)
        (*pSF)[subtree->symbol] = new_code(subtree);
    else
    {
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
    }
}
/*
 * Build the Huffman tree from the leaves in *pSF and derive the code table.
 *
 * On return (*pSF)[0] is the root of the tree (NULL when there were no
 * symbols); the caller owns both the tree and the returned table.
 * Returns a malloc'd SymbolEncoder, or NULL when memory runs out.
 */
static SymbolEncoder* calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

    /* NOTE(review): these dumps go to stdout and therefore end up inside
     * the compressed stream when no -o file is given. */
#if 1
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort the symbol frequency array by ascending frequency. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Get the number of symbols; NULL entries were sorted to the end. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * A single distinct symbol would otherwise become the root itself and
     * receive a zero-length code, which cannot be decoded. Give it a parent
     * so that it gets the 1-bit code "0".
     */
    if(n == 1)
    {
        huffman_node *only = (*pSF)[0];
        huffman_node *top = new_nonleaf_node(only->count, only, NULL);
        if(top)
        {
            only->parent = top;
            (*pSF)[0] = top;
        }
    }

    /*
     * Merge the two least frequent nodes n - 1 times. The condition is
     * written as i + 1 < n so that n == 0 does not wrap around.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(!pSE)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}
static int write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)//写码表
{unsigned long i, count = 0;/* Determine the number of entries in se. */for(i = 0; i < MAX_SYMBOLS; ++i){if((*se)[i])++count; //根据非空码字结构体的多少来计算有多少个码字}/* Write the number of entries in network byte order. */i = htonl(count); //在网络传输中,采用big-endian序,对于0x0A0B0C0D ,传输顺序就是0A 0B 0C 0D ,//因此big-endian作为network byte order,little-endian作为host byte order。//little-endian的优势在于unsigned char/short/int/long类型转换时,存储位置无需改变if(fwrite(&i, sizeof(i), 1, out) != 1) //码字总数写入输出文件return 1;/* Write the number of bytes that will be encoded. */symbol_count = htonl(symbol_count); //文件字节数if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1) //文件字节数写入输出文件return 1;/////将Huffman码表写入文件/////for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if(p){unsigned int numbytes;/* Write the 1 byte symbol. */fputc((unsigned char)i, out); //写符号的ASCII码十进制/* Write the 1 byte code bit length. */fputc(p->numbits, out); //写码长/* Write the code bytes. */numbytes = numbytes_from_numbits(p->numbits); //位数变字节if(fwrite(p->bits, 1, numbytes, out) != numbytes)//一次写一个字符的码字进去,因为长度设置的是该码字的字节数,fwrite的返回值为实际写入的数据项个数numbytes,所以当p不为空时,就会循环写入,return 1这条语句正常情况下永远不会执行,因此成功写入最后会return 0return 1;}}return 0;
}static int write_code_table_to_memory(buf_cache *pc,SymbolEncoder *se,unsigned int symbol_count)
{unsigned long i, count = 0;/* Determine the number of entries in se. */for(i = 0; i < MAX_SYMBOLS; ++i){if((*se)[i])++count;}/* Write the number of entries in network byte order. */i = htonl(count);if(write_cache(pc, &i, sizeof(i)))return 1;/* Write the number of bytes that will be encoded. */symbol_count = htonl(symbol_count);if(write_cache(pc, &symbol_count, sizeof(symbol_count)))return 1;/* Write the entries. */for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if(p){unsigned int numbytes;/* The value of i is < MAX_SYMBOLS (256), so it canbe stored in an unsigned char. */unsigned char uc = (unsigned char)i;/* Write the 1 byte symbol. */if(write_cache(pc, &uc, sizeof(uc)))return 1;/* Write the 1 byte code bit length. */uc = (unsigned char)p->numbits;if(write_cache(pc, &uc, sizeof(uc)))return 1;/* Write the code bytes. */numbytes = numbytes_from_numbits(p->numbits);if(write_cache(pc, p->bits, numbytes))return 1;}}return 0;
}//////////读取码表并重建据此Huffman树//////////
/*
 * Add the root-to-leaf path for one code to the tree under `root`.
 *
 * `bytes` holds `numbits` packed code bits. Missing interior nodes are
 * created on the way and the last bit creates the leaf for `symbol`.
 * Returns 0 on success, 1 when memory runs out or when the path would
 * continue below an existing leaf (a table that is not prefix-free).
 */
static int add_code_to_tree(huffman_node *root,
                            unsigned char symbol,
                            unsigned char numbits,
                            unsigned char *bytes)
{
    huffman_node *p = root;
    unsigned int curbit;

    for(curbit = 0; curbit < numbits; ++curbit)
    {
        const int is_last = (curbit + 1u == numbits);
        huffman_node **branch;

        /* A leaf's union holds a symbol, not child pointers. */
        if(p->isLeaf)
            return 1;

        branch = get_bit(bytes, curbit) ? &p->one : &p->zero;
        if(*branch == NULL)
        {
            *branch = is_last ? new_leaf_node(symbol)
                              : new_nonleaf_node(0, NULL, NULL);
            if(*branch == NULL)
                return 1;
            (*branch)->parent = p;
        }
        p = *branch;
    }

    return 0;
}

/*
 * Read the code table from `in` and rebuild the Huffman tree from it.
 *
 * Stores the number of original data bytes in *pDataBytes. Returns the
 * root of the tree (free it with free_huffman_tree), or NULL on a read
 * error, a malformed table or allocation failure.
 */
static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes)
{
    huffman_node *root = new_nonleaf_node(0, NULL, NULL);
    unsigned int count;

    if(!root)
        return NULL;

    /* Read the number of entries.
       (it is stored in network byte order). */
    if(fread(&count, sizeof(count), 1, in) != 1)
        goto fail;
    count = ntohl(count);

    /* Read the number of data bytes this encoding represents. */
    if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)
        goto fail;
    *pDataBytes = ntohl(*pDataBytes);

    /* Read the entries; each one adds a root-to-leaf path. */
    while(count-- > 0)
    {
        int c;
        int bad;
        unsigned char symbol;
        unsigned char numbits;
        unsigned char numbytes;
        unsigned char *bytes;

        if((c = fgetc(in)) == EOF)
            goto fail;
        symbol = (unsigned char)c;

        if((c = fgetc(in)) == EOF)
            goto fail;
        numbits = (unsigned char)c;

        numbytes = (unsigned char)numbytes_from_numbits(numbits);
        /* Never request 0 bytes: malloc(0) may legitimately return NULL. */
        bytes = (unsigned char*)malloc(numbytes ? numbytes : 1);
        if(!bytes)
            goto fail;

        bad = fread(bytes, 1, numbytes, in) != numbytes
           || add_code_to_tree(root, symbol, numbits, bytes);
        free(bytes);
        if(bad)
            goto fail;
    }

    return root;

fail:
    free_huffman_tree(root);
    return NULL;
}

/*
 * Copy `readlen` bytes from `buf` at offset *pindex into `bufout` and
 * advance *pindex. Returns 0 on success, 1 when fewer than `readlen`
 * bytes remain in the buffer of `buflen` bytes.
 */
static int memread(const unsigned char* buf,
                   unsigned int buflen,
                   unsigned int *pindex,
                   void* bufout,
                   unsigned int readlen)
{
    assert(buf && pindex && bufout);
    assert(buflen >= *pindex);
    if(buflen < *pindex)
        return 1;

    /*
     * Compare against the bytes remaining: this cannot overflow, and a
     * read that ends exactly at the end of the buffer is accepted.
     */
    if(readlen > buflen - *pindex)
        return 1;

    memcpy(bufout, buf + *pindex, readlen);
    *pindex += readlen;
    return 0;
}

/*
 * Same as read_code_table, but the table is read from the `bufinlen` bytes
 * at `bufin`, starting at offset *pindex, which is advanced past the table.
 */
static huffman_node*read_code_table_from_memory(const unsigned char* bufin,
                                                unsigned int bufinlen,
                                                unsigned int *pindex,
                                                unsigned int *pDataBytes)
{
    huffman_node *root = new_nonleaf_node(0, NULL, NULL);
    unsigned int count;

    if(!root)
        return NULL;

    /* Read the number of entries.
       (it is stored in network byte order). */
    if(memread(bufin, bufinlen, pindex, &count, sizeof(count)))
        goto fail;
    count = ntohl(count);

    /* Read the number of data bytes this encoding represents. */
    if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))
        goto fail;
    *pDataBytes = ntohl(*pDataBytes);

    /* Read the entries. */
    while(count-- > 0)
    {
        int bad;
        unsigned char symbol;
        unsigned char numbits;
        unsigned char numbytes;
        unsigned char *bytes;

        if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))
            goto fail;

        if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))
            goto fail;

        numbytes = (unsigned char)numbytes_from_numbits(numbits);
        bytes = (unsigned char*)malloc(numbytes ? numbytes : 1);
        if(!bytes)
            goto fail;

        bad = memread(bufin, bufinlen, pindex, bytes, numbytes)
           || add_code_to_tree(root, symbol, numbits, bytes);
        free(bytes);
        if(bad)
            goto fail;
    }

    return root;

fail:
    free_huffman_tree(root);
    return NULL;
}
/*
 * Second pass over the input: write the code of every byte read from `in`
 * to `out`, packing code bits into bytes from the least significant bit up.
 * A final, partially filled byte is written with zero padding.
 *
 * Returns 0 on success, 1 on a write error or when a byte has no entry in
 * the code table `se`.
 */
static int do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;  /* output byte being assembled */
    unsigned char curbit = 0;   /* next free bit position in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* The input holds a byte that the counting pass did not see. */
        if(!code)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= (unsigned char)(get_bit(code->bits, i) << curbit);

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}

/*
 * Same as do_file_encode, but the input is the `bufinlen` bytes at `bufin`
 * and the output goes through the buffer cache `pc`.
 */
static int do_memory_encode(buf_cache *pc,
                            const unsigned char* bufin,
                            unsigned int bufinlen,
                            SymbolEncoder *se)
{
    unsigned char curbyte = 0;
    unsigned char curbit = 0;
    unsigned int pos;

    for(pos = 0; pos < bufinlen; ++pos)
    {
        unsigned char uc = bufin[pos];
        huffman_code *code = (*se)[uc];
        unsigned long i;

        if(!code)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= (unsigned char)(get_bit(code->bits, i) << curbit);

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(write_cache(pc, &curbyte, sizeof(curbyte)))
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;
}
/*
 * Fill st->freq with the relative frequency of every byte value.
 *
 * A used symbol gets count / total_count; an unused symbol gets 0.
 * Returns 1 when the per-symbol counts add up to `total_count`, else 0.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int i;
    unsigned long count = 0;    /* sum of all symbol counts */

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*SF)[i])
        {
            st->freq[i] = (float)(*SF)[i]->count / total_count;
            count += (*SF)[i]->count;
        }
        else
        {
            st->freq[i] = 0;
        }
    }

    if(total_count >= 0 && count == (unsigned long)total_count)
        return 1;
    else
        return 0;
}
/*
 * Copy the length and the packed bits of every code in `se` into `st`.
 * Symbols without a code get numbits 0. Always returns 0.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long i,j;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned long numbytes;

            st->numbits[i] = p->numbits;
            numbytes = numbytes_from_numbits(p->numbits);

            /* Never copy past the fixed-size row reserved for one code. */
            if(numbytes > sizeof(st->bits[i]))
                numbytes = sizeof(st->bits[i]);

            for(j = 0; j < numbytes; j++)
                st->bits[i][j] = p->bits[j];
        }
        else
            st->numbits[i] = 0;     /* the symbol never occurred, so it has no code */
    }

    return 0;
}
/*
 * Write the statistics table to `out_Table`: a header line, then one line
 * per byte value with its value, frequency, code length and code bits
 * (first code bit first). Nothing is written when `out_Table` is NULL.
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int i;
    unsigned long j;
    unsigned char c;

    /* The statistics file is optional. */
    if(!out_Table)
        return;

    fprintf(out_Table,"symbol\t freq\t codelength\t code\n");

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        fprintf(out_Table,"%d\t ",i);
        fprintf(out_Table,"%f\t ",st->freq[i]);
        /* numbits is unsigned long, so it needs %lu rather than %d. */
        fprintf(out_Table,"%lu\t ",st->numbits[i]);

        if(st->numbits[i])
        {
            for(j = 0; j < st->numbits[i]; ++j)
            {
                c = get_bit(st->bits[i], j);
                fprintf(out_Table,"%d",c);
            }
        }
        fprintf(out_Table,"\n");
    }
}
int huffman_encode_file(FILE *in, FILE *out, FILE *out_Table) //Huffman编码,增加一个FILE *out_Table,用于输出表格
{SymbolFrequencies sf; //含有256个节点的数组SymbolEncoder *se; //指向256个编码的指针huffman_node *root = NULL; //根节点int rc;unsigned int symbol_count; //文件中总ASCII码数huffman_stat hs; //输出结果的表 包括符号频率 码长 码字等/////获取输入文件的每个符号的出现概率/////symbol_count = get_symbol_frequencies(&sf, in); //演示扫描完一遍文件后,SF指针数组的每个元素的构成//sf中每个节点所代表的信源符号出现的次数count已经被赋值 huffST_getSymFrequencies(&sf,&hs,symbol_count);/////从符号统计来建立一个最理想的表格来/////se = calculate_huffman_codes(&sf); //编码 256个节点传入得到256个码字root = sf[0]; //根节点huffST_getcodeword(se, &hs); //为输出信息赋值output_huffman_statistics(&hs,out_Table); //输出信息/////再次扫描文件,用预先建立的表格把它编成输出文件/////rewind(in); //将输入文件的内部指针重新指向文件开头rc = write_code_table(out, se, symbol_count); //写码表if(rc == 0) //成功写入码表后,rc就被赋值为0rc = do_file_encode(in, out, se); //写编码后的文件,返回值为0/////释放Huffman码树/////free_huffman_tree(root); //释放码树free_encoder(se); //释放码字结构体return rc;
}//////////读取Huffman码字,并解码输出//////////
int huffman_decode_file(FILE *in, FILE *out)
{huffman_node *root, *p;int c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);if(!root)return 1; //Huffman树建立失败/* Decode the file. */p = root;while(data_count > 0 && (c = fgetc(in)) != EOF)//data_count>0:逻辑上仍有数据;(c=fgetc(in)!=EOF):文件中仍有数据{unsigned char byte = (unsigned char)c; //1byte的码字unsigned char mask = 1; //mask用于逐位读出码字while(data_count > 0 && mask) //loop9:mask=0x00000000,跳出循环{p = byte & mask ? p->one : p->zero; //沿Huffman树前进mask <<= 1; //loop1:byte&0x00000001//loop2:byte&0x00000010//……//loop8:byte&0x10000000if(p->isLeaf) //至叶节点(解码完毕){fputc(p->symbol, out);p = root;--data_count;}}}free_huffman_tree(root); //所有Huffman码字均已解码输出,文件解码完毕return 0;
}#define CACHE_SIZE 1024int huffman_encode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;buf_cache cache;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen))return 1;/* Get the frequency of each symbol in the input memory. */symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);root = sf[0];/* Scan the memory again and, using the tablepreviously built, encode it into the output memory. */rc = write_code_table_to_memory(&cache, se, symbol_count);if(rc == 0)rc = do_memory_encode(&cache, bufin, bufinlen, se);/* Flush the cache. */flush_cache(&cache);/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);free_cache(&cache);return rc;
}int huffman_decode_memory(const unsigned char *bufin,unsigned int bufinlen,unsigned char **pbufout,unsigned int *pbufoutlen)
{huffman_node *root, *p;unsigned int data_count;unsigned int i = 0;unsigned char *buf;unsigned int bufcur = 0;/* Ensure the arguments are valid. */if(!pbufout || !pbufoutlen)return 1;/* Read the Huffman code table. */root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);if(!root)return 1;buf = (unsigned char*)malloc(data_count);/* Decode the memory. */p = root;for(; i < bufinlen && data_count > 0; ++i) {unsigned char byte = bufin[i];unsigned char mask = 1;while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;if(p->isLeaf){buf[bufcur++] = p->symbol;p = root;--data_count;}}}free_huffman_tree(root);*pbufout = buf;*pbufoutlen = bufcur;return 0;
}
实验结果与总结
实验选取了十种文件进行编码,并对编码后输出的excel文件进行了分析。
其中:
p*code列为符号出现概率与码长之积
-p*log(p)列为符号的自信息与出现概率之积(对所有符号求和即得到信源熵)
不同种类文件符号概率分布不同
整理得出
文件的平均码长与信源熵大小相近。
实验中YUV信源符号出现概率分布变化较大,其余文件信源符号接近等概分布,而BMP文件压缩效率较高。
数据压缩原理 实验三 Huffman编解码算法实现与压缩效率分析相关推荐
- 数据压缩 实验三 Huffman编解码算法实现与压缩效率分析
实验目的 掌握Huffman编解码实现的数据结构和实现框架, 进一步熟练使用C编程语言, 并完成压缩效率的分析. 实验原理 1.本实验中Huffman编码算法 (1)将文件以ASCII字符流的形式读入 ...
- 实验三 Huffman编解码算法实现与压缩效率分析
一.Huffman编解码原理 1. Huffman编码 对原始文件进行Huffman编码,首先需要解决以下几点问题: 文件符号的概率分布情况是怎样的? Huffman树是如何建立的? 建立起Huffm ...
- Huffman 编解码算法实现与压缩效率分析
一.实验原理 1 熵,又称为"信息熵" (Entropy) 1.1 在信息论中,熵是信息的度量单位.信息论的创始人 Shannon 在其著作<通信的 数学理论>中提出了 ...
- 实验三 LZW编解码算法实现与分析
LZW简述 本部分参考wiki https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch LZW压缩算法在1978年提出,由 Abr ...
- 实验三—Huffman编解码
一.实验原理 1.Huffman编码的步骤: (1)首先将所有字符发生的概率从小到大进行排序: (2)将最小的两个概率进行两两一合并,之后继续找最小的两个概率进行合并包括前面已经合并的和数: (3)一 ...
- [实验三]LZW 编解码算法实现与分析
目录 一.LZW算法 1.1 编码步骤 1.2 解码步骤 1.3 关于有可能出现当前码字CW不在词典中的情况说明 二.代码实现 2.1 程序说明 2.2 数据结构 2.3 bitio.h 2.4 bi ...
- 数据压缩实验三--Huffman编解码及压缩率的比较
一,Huffman码 1 Huffman 编码 Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman编码是可变字长编码(VLC)的一种. Huffman 编码基于信源 ...
- huffman编解码算法实验与压缩效率分析
一.基本原理 1.huffman编码原理 huffman编码是一种无失真编码方式,是可变长(VLC)编码的一种. huffman编码基于信源的概率统计模型,基本思路是出现概率大的信源符号编长码,出现概 ...
- 实验三 LZW编解码实验
一.LZW算法简介 LZW为词典编码的一种,是通过从输入数据中创建"短语词典".在编码过程中遇到词典中出现的"短语"时,编码器就输出其对应的"序号&q ...
最新文章
- 牛客华为机试第5题python
- k-means-algorithm
- 介绍map.entry接口
- 用python画蝴蝶_图形化编程经验分享,画笔基础,软件包括Python、Kittenblock
- Python 之 风格规范(Google )
- 7 useLayoutEffect、useDebugValue
- token拦截器android_vue.js添加拦截器,实现token认证(使用axios)
- 作者:季统凯(1972-),男,博士,中国科学院云计算产业技术创新与育成中心研究员...
- javamail 超时_为什么JavaMail连接超时太长
- [ABP开源项目]--vue+vuex+vue-router+EF的权限管理系统
- 伸缩Kubernetes到2500个节点中遇到的问题和解决方法
- 速达5000进销存PDA条码打印扫码开单-吉度PDA定制
- python可以做手机脚本吗,如何将Python自动化测试脚本放在手机上运行
- Restful API详解
- 明朝取代元朝鲜为人知的秘密
- java 为文件赋权,linux 系统给文件赋权命令大全
- 计算机工资表2017,2017事业单位工资标准表
- 华为杯山东理工大学第二届团体程序设计天梯赛
- Qt边框border概述
- 时间分治(cdq分治)
热门文章
- eclipse 导出linux rcp,导出一个RCP程序(Exporting an RCP Application)
- 工业数据分析技术与实战之数据分析的误区——昆仑数据田春华培训听课记录
- ubuntu下搭建apache web服务器,运行cgi配置
- 教你更高效的管理医院设备,可别错过
- 【原创】线性代数学习笔记——剑桥食谱
- Apache 的 httpd.conf 详解 【转】
- VH-HFCN based Parking Slot and Lane Markings Segmentation on Panoramic Surround View
- 金融行业数据该如何防泄露
- 使用 Skywalking 实现全链路监控
- 【软件工程】用户界面设计