python 实现的huffman 编码压缩,解码解压缩
压缩效果
使用本程序对《平凡的世界》做压缩测试,压缩前为文本文件,大小为1.7M,压缩后为二进制文件,大小接近1M(988,817byte),而zip压缩后体积为920,997byte,比zip差,压缩文件存储格式待改善。另外,因为从Huffman压缩算法的原理可知,该算法对字符重复率高的文本最有效,比如长篇小说或者英文小说。
- 略大文件
test3.txt 《平凡的世界》
压缩前:1.62M
压缩后:1.39M
压缩率:86%
压缩时间14.23秒
解压时间 16.85秒
测试结果:压缩,解压成功!
压缩解压时间在可接受范围之内
'''
Create a Huffman tree from a list of (key, weight) pairs.
The input is a list like
[('a', 3), ('b', 2)]
where the number of times 'a' appears is stored as its weight.
'''
7 from Queue import PriorityQueue
8 #if do not use treeWiter so not include pygraphviz than can use py3.0
9 from treeWriter import TreeWriter
10 from copy import copy
11
class NodeBase(object):
    '''
    Common base of the huffman tree nodes; every node carries a weight.

    Nodes order by weight so that (weight, node) tuples whose weights are
    equal can still be compared inside a priority queue on Python 3, where
    objects without ordering methods raise TypeError.
    '''
    def __init__(self):
        self.weight = 0

    def elem(self):
        '''Return the weight of this node.'''
        return self.weight

    def __lt__(self, other):
        '''Return True when this node weighs less than `other`.'''
        return self.weight < other.weight
18
class Node(NodeBase):
    '''
    Internal node of the huffman tree: a weight plus a left and a right
    child. All three default to "empty" (0 / None).
    '''
    def __init__(self, weight = 0, left = None, right = None):
        self.left, self.right = left, right
        self.weight = weight

    def __str__(self):
        # An internal node is displayed as its weight.
        text = str(self.weight)
        return text
27
class Leaf(NodeBase):
    '''
    Leaf of the huffman tree: the symbol (key) it stands for and the
    weight of that symbol.
    '''
    def __init__(self, key = '', weight = 0):
        self.weight = weight
        self.key = key

    def __str__(self):
        # A leaf is displayed as its key.
        text = str(self.key)
        return text
35
36
def convert(c):
    '''
    Return the bits of one byte as a list of '0'/'1' characters, most
    significant bit first, zero-padded to at least 8 entries.

    c is either a one-character str / one-byte bytes object (as returned by
    file.read(1)) or an int byte value (as produced by iterating bytes on
    Python 3).

    convert('a') -> ord('a') = 97 -> '01100001'
    returns ['0', '1', '1', '0', '0', '0', '0', '1']
    '''
    value = c if isinstance(c, int) else ord(c)
    return list(format(value, '08b'))
47
class HuffmanTree():
    '''
    Shared parent of HuffmanTreeForCompress and HuffmanTreeForDecompress.
    It only holds the root of the tree, which starts out as None.
    '''
    def __init__(self):
        # Subclasses replace this with the real root once the tree is built.
        self.root = None
54
class HuffmanTreeForCompress(HuffmanTree):
    '''
    Huffman tree built from symbol frequencies, for the compressing process.

    self.list is a list of (key, weight) pairs like [('a', 3), ('b', 4)],
    where 'a' is the key and 3 its weight, i.e. how often 'a' appears in
    the text.
    self.dict maps every key to its code as a list of 0/1 ints, like
    {'a': [0, 1, 1, 0], ...}; it is filled in by the constructor.

    Edge cases:
      * empty list      -> self.root stays None and self.dict stays empty
      * a single symbol -> the only leaf is hung under both branches of a
                           root Node, so the symbol gets a one-bit code
                           instead of an empty one
    '''
    def __init__(self, list):
        HuffmanTree.__init__(self)
        self.list = list    # like [('a', 3), ('b', 4)]
        self.dict = {}      # like {'a': [0, 1, 1, 0], ...}

        self.__buildTree()
        self.__genEncode()

    def __initPriorityQueue(self, queue):
        '''
        Append one (weight, order, Leaf) entry per symbol to the list `queue`.
        `order` is a running number used only to break ties between equal
        weights, so the nodes themselves never have to be compared.
        '''
        for order, (key, weight) in enumerate(self.list):
            queue.append((weight, order, Leaf(key, weight)))

    def __buildTree(self):
        '''
        Build the huffman tree from the weights with a min-heap.
        Greedy algorithm: repeatedly merge the two lightest nodes.
        '''
        import heapq

        queue = []
        self.__initPriorityQueue(queue)
        if not queue:
            # Nothing to encode; the original blocked forever here.
            return
        if len(queue) == 1:
            # A lone leaf as root would get the empty code, so no bits would
            # ever be written for it. Use it as both children instead.
            leaf = queue[0][2]
            self.root = Node(leaf.weight, leaf, leaf)
            return

        heapq.heapify(queue)
        order = len(queue)
        while len(queue) > 1:
            left = heapq.heappop(queue)[2]
            right = heapq.heappop(queue)[2]
            weight = left.weight + right.weight
            heapq.heappush(queue, (weight, order, Node(weight, left, right)))
            order += 1
        self.root = queue[0][2]

    def __genEncode(self):
        '''
        Fill self.dict with the code of every key using a depth first walk:
        going left appends a 0, going right appends a 1.
        '''
        if self.root is None:
            return
        # Each stack entry owns its own code list, so no copying is needed
        # when a leaf is reached.
        pending = [(self.root, [])]
        while pending:
            node, code = pending.pop()
            if isinstance(node, Leaf):
                self.dict[node.key] = code
                continue
            pending.append((node.right, code + [1]))
            pending.append((node.left, code + [0]))
111
112
class HuffmanTreeForDecompress(HuffmanTree):
    '''
    Rebuild the huffman tree for the decompressing process.

    The tree is read from `infile` in pre-order, two bytes per node:
    a first byte of 0x00 marks a leaf and the second byte is its key;
    any other first byte marks an internal node, whose left and right
    subtrees follow.
    '''
    def __init__(self, infile):
        HuffmanTree.__init__(self)
        self.__buildTree(infile)

    def __buildTree(self, infile):
        '''
        Read the serialized tree from the current position of `infile` and
        store it in self.root.

        Raises ValueError when the data ends in the middle of the tree.
        '''
        def buildTreeHelp(infile):
            first = infile.read(1)
            second = infile.read(1)
            if not first or not second:
                # Without this check a truncated file recursed without end,
                # because read() keeps returning an empty string at EOF.
                raise ValueError('truncated huffman tree header')
            if first == b'\x00':    # is leaf, not consider unicode now
                return Leaf(second)
            node = Node()
            node.left = buildTreeHelp(infile)
            node.right = buildTreeHelp(infile)
            return node

        self.root = buildTreeHelp(infile)
136
class Decompress():
    '''
    Decompress a file written by Compress.

    Expected layout of the input: the serialized huffman tree, one byte
    with the number of padding bits, one byte holding the final partial
    byte, then all the full encoded bytes. A completely empty input is
    decoded to an empty output.
    '''
    def __init__(self, infileName, outfileName = ''):
        '''
        Open `infileName` for reading and `outfileName` for writing.
        When no output name is given, infileName + '.de' is used.
        '''
        self.infile = open(infileName, 'rb')
        if outfileName == '':
            outfileName = infileName + '.de'
        self.outfile = open(outfileName, 'wb')
        self.tree = None

    def __del__(self):
        # __init__ may have failed before both files were opened, so only
        # close what actually exists.
        for name in ('infile', 'outfile'):
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()

    def decompress(self):
        '''Rebuild the huffman tree from the input, then decode the data.'''
        self.infile.seek(0)
        if not self.infile.read(1):
            # Empty input: there is no tree to read and nothing to decode.
            self.outfile.flush()
            return
        self.__rebuildHuffmanTree()
        self.__decodeFile()

    def __rebuildHuffmanTree(self):
        self.infile.seek(0)
        self.tree = HuffmanTreeForDecompress(self.infile)

    def __decodeBits(self, byte, count, curNode):
        '''
        Walk the tree with the `count` most significant bits of the int
        `byte`, starting at `curNode`; write the key of every leaf reached.
        Returns the node the walk stopped at, so a code may span bytes.
        '''
        root = self.tree.root
        write = self.outfile.write
        for shift in range(7, 7 - count, -1):
            if (byte >> shift) & 1:
                curNode = curNode.right
            else:
                curNode = curNode.left
            if isinstance(curNode, Leaf):
                write(curNode.key)
                curNode = root
        return curNode

    def __decodeFile(self):
        '''
        Decode everything after the tree and write it to the output file.

        Raises ValueError when the two header bytes after the tree are
        missing.
        '''
        self.outfile.seek(0)
        header = self.infile.read(2)
        if len(header) < 2:
            raise ValueError('truncated compressed file: missing padding header')
        leftBit = ord(header[0:1])  # number of unused bits in the last byte
        lastByte = ord(header[1:2]) # only meaningful when leftBit != 0
        curNode = self.tree.root

        # Read in chunks; bytearray yields ints on Python 2 and 3 alike.
        while 1:
            chunk = self.infile.read(65536)
            if not chunk:
                break
            for byte in bytearray(chunk):
                curNode = self.__decodeBits(byte, 8, curNode)

        # The last byte carries 8 - leftBit valid bits, which may hold more
        # than one symbol; decode all of them and ignore only the padding.
        if leftBit:
            self.__decodeBits(lastByte, 8 - leftBit, curNode)

        self.outfile.flush()
200 #gc.enable()
201
202
203
class Compress():
    '''
    Huffman-compress one file into another.

    Layout of the output: the serialized huffman tree, one byte with the
    number of padding bits, one byte holding the final partial byte, then
    all the full encoded bytes. An empty input produces an empty output.
    '''
    def __init__(self, infileName, outfileName = ''):
        '''
        Open `infileName` for reading and `outfileName` for writing.
        When no output name is given, infileName + '.compress' is used.
        '''
        self.infile = open(infileName, 'rb')
        if outfileName == '':
            outfileName = infileName + '.compress'
        self.outfile = open(outfileName, 'wb')
        self.dict = {}
        self.tree = None

    def __del__(self):
        # __init__ may have failed before both files were opened, so only
        # close what actually exists.
        for name in ('infile', 'outfile'):
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()

    def compress(self):
        '''Count byte frequencies, build the tree, write the output file.'''
        self.__caculateFrequence()
        if not self.dict:
            # Empty input: there is no tree to build and nothing to encode.
            self.outfile.flush()
            return
        self.__createHuffmanTree()
        self.__writeCompressedFile()

    def __caculateFrequence(self):
        '''
        First pass over the input file: count how often every byte occurs
        and add the counts to self.dict, keyed by the one-byte string.
        '''
        self.infile.seek(0)
        counts = [0] * 256
        while 1:
            chunk = self.infile.read(65536)
            if not chunk:
                break
            # bytearray yields ints on Python 2 and 3 alike
            for value in bytearray(chunk):
                counts[value] += 1
        for value, count in enumerate(counts):
            if count:
                # The first occurrence counts too (it used to be stored as 0).
                key = bytes(bytearray((value,)))
                self.dict[key] = self.dict.get(key, 0) + count

    def __createHuffmanTree(self):
        '''
        Build a huffman tree from self.dict.items()
        '''
        self.tree = HuffmanTreeForCompress(list(self.dict.items()))

    def __writeCompressedFile(self):
        '''
        Create the compressed file:
        first write the huffman tree to the head of outfile, then translate
        the input file with the codes and write the result to outfile.
        '''
        self.outfile.seek(0)
        self.__serializeTree()
        self.__encodeFile()

    def __serializeTree(self):
        '''
        Write the tree to the output file in pre-order, two bytes per node:
        0x00 followed by the key for a leaf, 0xff 0xfe for an internal node
        (followed by its left and then its right subtree).
        '''
        def serializeTreeHelp(root, mfile):
            if isinstance(root, Leaf):
                mfile.write(b'\x00')
                mfile.write(root.key)
                return
            mfile.write(b'\xff\xfe')
            serializeTreeHelp(root.left, mfile)
            serializeTreeHelp(root.right, mfile)
        serializeTreeHelp(self.tree.root, self.outfile)

    def __encodeFile(self):
        '''
        Second pass over the input file: translate every byte to its
        huffman code, pack the bits into bytes and write them to outfile.
        '''
        self.infile.seek(0)
        # Remember this position; the two header bytes are filled in last.
        pos = self.outfile.tell()
        self.outfile.write(b'\x00\x00')

        # Index the codes by int byte value so chunks can be walked as ints.
        table = dict((ord(key), bits) for key, bits in self.tree.dict.items())
        num = 0     # bits collected for the byte being assembled
        i = 0       # how many bits `num` currently holds
        while 1:
            chunk = self.infile.read(65536)
            if not chunk:
                break
            packed = bytearray()
            for value in bytearray(chunk):
                for x in table[value]:
                    num = (num << 1) | x
                    i += 1
                    if i == 8:
                        packed.append(num)
                        num = 0
                        i = 0
            self.outfile.write(bytes(packed))

        # The unused low bits of the last byte are filled with 0: when the
        # last bits are 11 there are 6 bits left, so 1100 0000 is stored.
        leftBit = (8 - i) % 8
        num <<= leftBit

        # Just after the huffman tree: how many padding bits the last byte
        # has, followed by that last byte itself.
        self.outfile.seek(pos)
        self.outfile.write(bytes(bytearray((leftBit, num))))
        self.outfile.flush()
320 #self.outfile.seek(0,2) #will not write success without this a bug???
321 #print self.outfile.read(1)
322
323
324
325 # def test(self):
326 # for k, v in self.dict.items():
327 # print k
328 # print v
329
330
class HuffmanTreeWriter(TreeWriter):
    '''
    Draw a huffman tree to tree.png or a user specified file.
    For huffman debugging only.
    '''
    def writeHelp(self, root, A):
        '''
        Add the subtree rooted at `root` to the graph `A` and return the id
        string that was given to `root`.
        self.num and self.num2 are counters that are not set in this class
        (presumably by TreeWriter - TODO confirm).
        '''
        node_id = str(self.num)
        self.num += 1

        if isinstance(root, Leaf):
            # TODO the key is left out of the label: a '\n' key breaks it
            A.add_node(node_id, label = str(root.elem()) + r'\n', shape = 'rect')
            return node_id

        # A non-leaf node of a huffman tree always has both children.
        A.add_node(node_id, label = str(root.elem()))

        left_id = self.writeHelp(root.left, A)
        A.add_node(left_id, label = str(root.left.elem()))
        A.add_edge(node_id, left_id, label = '0')

        right_id = self.writeHelp(root.right, A)
        A.add_node(right_id, label = str(root.right.elem()))
        A.add_edge(node_id, right_id, label = '1')

        # Invisible node between the two children, all three on one rank.
        spacer_id = str(self.num2)
        self.num2 -= 1
        A.add_node(spacer_id, style = 'invis')
        A.add_edge(node_id, spacer_id, style = 'invis')
        same_rank = A.add_subgraph([left_id, spacer_id, right_id], rank = 'same')
        same_rank.add_edge(left_id, spacer_id, style = 'invis')
        same_rank.add_edge(spacer_id, right_id, style = 'invis')

        return node_id  # id of the root of this subtree
367
368
369
370
if __name__ == '__main__':
    # Tree drawing demo, kept disabled:
    #d = [chr(ord('a')+i) for i in range(13)]
    #w = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41]
    #tree = HuffmanTreeForCompress([(d[i], w[i]) for i in range(13)])
    #HuffmanTreeWriter(tree).write()
    import sys

    # Without a command line argument fall back to 'test.log'.
    inputFileName = 'test.log' if len(sys.argv) == 1 else sys.argv[1]

    # Round trip: <input> -> <input>.compress -> <input>.compress.de
    compress = Compress(inputFileName)
    compress.compress()

    decompress = Decompress(inputFileName + '.compress')
    decompress.decompress()
392
393 #compress.test()
394
python 实现的huffman 编码压缩,解码解压缩相关推荐
- python Huffman编码及解码
Huffman编码及解码 # coding:utf-8#Tree-Node Type class Node:def __init__(self,freq):self.left = Noneself.r ...
- python中的URL编码和解码
python中的URL编码和解码:test.py 1 # 引入urllib的request模块 2 import urllib.request 3 4 url = 'https://www.douba ...
- 二十六、python中字符串的编码与解码,utf-8编码与解码,gbk编码与解码
在python中,我们经常会使用到字符串的编码与解码,推荐你在写代码的过程中都用utf-8编码解码 1.utf-8编码用函数encode,例子,编码英文跟数字的时候,他只是在前面多了个b,编码中文的时 ...
- python url解码_对python中url参数编码与解码的实例详解
一.简介 在python中url,对于中文等非ascii码字符,需要进行参数的编码与解码. 二.关键代码 1.url编码 对字符串编码用urllib.parse包下的quote(string, saf ...
- Huffman编码与解码
Huffman编码与解码 // @author: Folivora Li // @copyright Folivora Li/* 4.Huffman编码与解码 (必做)(Huffman编码.二叉树) ...
- Huffman编码压缩文件
文章目录 前言 一.Huffman编码是什么? 二.Huffman编码的实现方法 三.Huffman压缩文件 1.统计文件个字符出现的次数 2.生成Huffman树 3.生成码表 4.对文件进行压缩 ...
- python编码解码的过程_使用Python过程中的编码和解码
编码和解码的问题纠结了我很久了,对他一直只有是是而非的理解,好像是那么回事,但是又不懂,今天终于来认真解决一下这个问题,总结一下大神们的回答,做一下笔记. 首先,我们知道,计算机中的所有数据都以二进制 ...
- DS二叉树——Huffman编码与解码(不含代码框架)
题目描述 1.问题描述 给定n个字符及其对应的权值,构造Huffman树,并进行huffman编码和译(解)码. 构造Huffman树时,要求左子树根的权值小于.等于右子树根的权值. 进行Huffma ...
- huffman编码压缩c语言,用Huffman编码对文件进行压缩的C语言实现
本文介绍了采用Huffman编码对ASCII码文件进行压缩的基本原理,并用C语言程序实现了这个压缩过程.文中给出了比较完整的C语言程序代码,可以直接用于调试实验. 福 建电 脑 21 0 2年第 1期 ...
- 实验三Huffman编码与解码
一.实验原理 Huffman编码实现的数据结构 Huffman编码为可变长编码,若各码字长度按照所对应符号出现概率的大小逆序排列,则其平均长度最小. 编码步骤: 1.将信源符号按照出现概率由大到小的顺 ...
最新文章
- 被陆奇看好的项目都好奇葩
- 【290】Python 函数
- 自学python清单-我的2018学习清单
- 三.Hystrix资源隔离
- 神经网络有什么理论支持? 本文作者:AI研习社 2017-11-08 18:30 导语:问:神经网络有什么理论支持? 答:目前为止(2017 年)没有什么特别靠谱的。 雷锋网按:本文原作者袁洋
- [转载]根据两点的经纬度求方位角和距离,等
- 计算机主板上电源怎么插,教大家电脑主板上的电源开关插头怎么接
- python 聚类_聚类算法中的四种距离及其python实现
- 【Java并发编程】16、ReentrantReadWriteLock源码分析
- java体系的中间件适用于go吗_golang gf框架自定义中间件实现管理界面授权
- SQLPro Studio mac如何链接MYSQL?
- idea生成get/set方法
- OpenCV项目实战
- java编写数独_简单实现java数独游戏
- java游戏猿人时代_猿人时代BT版_JAVA游戏免费版下载_7723手机游戏[www.7723.cn]
- LC - P03 机器人大冒险
- nginx动静分离和资源隔离的网站搭建
- 计算机一级插入页码,同一篇文档中插入相同页码的小技巧
- Mac邮件客户端怎么添加QQ邮箱
- 你的声音价值百万,教你六个利用声音赚钱的方法