Huffman Coding
Huffman Coding is a widely used lossless data compression algorithm that is based on the frequency of symbols in the data. Developed by David A. Huffman in 1952, this algorithm assigns variable-length codes to input characters, with shorter codes assigned to more frequent characters and longer codes to less frequent characters.
Huffman coding is used in several compression formats, including ZIP files, JPEG images, and MP3 audio files, as it achieves a near-optimal data compression ratio. It is particularly efficient when the frequency distribution of the symbols in the dataset is uneven.
Huffman Coding works by following these steps:
Let's take an example of encoding the string: "ABRACADABRA".
Frequency Count:
Huffman Tree Construction:
Start with nodes containing the characters and their frequencies:
A(5), B(2), R(2), C(1), D(1).
Merge C and D (smallest frequencies), forming a node (C+D) with a frequency of 2.
Now the nodes are: A(5), B(2), R(2), (C+D)(2).
Merge B and R, forming a node (B+R) with a frequency of 4.
Now the nodes are: A(5), (B+R)(4), (C+D)(2).
Merge (C+D) and (B+R), forming a node (C+D+B+R) with a frequency of 6.
Now the nodes are: A(5), (C+D+B+R)(6).
Finally, merge A and (C+D+B+R), forming the root of the tree with a frequency of 11.
Now, let's look at how to implement Huffman Coding in both Python and C++.
import heapq
from collections import defaultdict
class Node:
def __init__(self, char, freq):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other):
return self.freq < other.freq
def build_huffman_tree(text):
# Frequency of characters
frequency = defaultdict(int)
for char in text:
frequency[char] += 1
# Create a priority queue (min-heap)
heap = [Node(char, freq) for char, freq in frequency.items()]
heapq.heapify(heap)
while len(heap) > 1:
left = heapq.heappop(heap)
right = heapq.heappop(heap)
merged = Node(None, left.freq + right.freq)
merged.left = left
merged.right = right
heapq.heappush(heap, merged)
return heap[0]
def generate_codes(node, prefix='', codebook={}):
if node is not None:
if node.char is not None:
codebook[node.char] = prefix
generate_codes(node.left, prefix + '0', codebook)
generate_codes(node.right, prefix + '1', codebook)
return codebook
def huffman_encoding(text):
root = build_huffman_tree(text)
huffman_codes = generate_codes(root)
encoded_text = ''.join([huffman_codes[char] for char in text])
return encoded_text, huffman_codes
def huffman_decoding(encoded_text, huffman_codes):
reverse_codes = {v: k for k, v in huffman_codes.items()}
current_code = ''
decoded_text = ''
for bit in encoded_text:
current_code += bit
if current_code in reverse_codes:
decoded_text += reverse_codes[current_code]
current_code = ''
return decoded_text
# Example Usage
text = "ABRACADABRA"
encoded_text, huffman_codes = huffman_encoding(text)
print("Encoded Text:", encoded_text)
print("Huffman Codes:", huffman_codes)
decoded_text = huffman_decoding(encoded_text, huffman_codes)
print("Decoded Text:", decoded_text)
Explanation:
huffman_encoding
function encodes the string, and huffman_decoding
function decodes it back to the original string.
#include <iostream>
#include <queue>
#include <unordered_map>
#include <string>
using namespace std;
struct Node {
char ch;
int freq;
Node* left;
Node* right;
Node(char c, int f) : ch(c), freq(f), left(nullptr), right(nullptr) {}
};
struct Compare {
bool operator()(Node* left, Node* right) {
return left->freq > right->freq;
}
};
void generateCodes(Node* root, string str, unordered_map<char, string>& codes) {
if (root == nullptr)
return;
if (root->ch != '$') // If it's a leaf node
codes[root->ch] = str;
generateCodes(root->left, str + "0", codes);
generateCodes(root->right, str + "1", codes);
}
Node* buildHuffmanTree(const string& text) {
unordered_map<char, int> freq;
for (char c : text) {
freq[c]++;
}
priority_queue<Node*, vector<Node*>, Compare> pq;
for (auto& pair : freq) {
pq.push(new Node(pair.first, pair.second));
}
while (pq.size() > 1) {
Node* left = pq.top();
pq.pop();
Node* right = pq.top();
pq.pop();
Node* newNode = new Node('$', left->freq + right->freq);
newNode->left = left;
newNode->right = right;
pq.push(newNode);
}
return pq.top();
}
string huffmanEncoding(const string& text, unordered_map<char, string>& codes) {
Node* root = buildHuffmanTree(text);
generateCodes(root, "", codes);
string encodedText = "";
for (char c : text) {
encodedText += codes[c];
}
return encodedText;
}
string huffmanDecoding(const string& encodedText, unordered_map<char, string>& codes) {
unordered_map<string, char> reverseCodes;
for (auto& pair : codes) {
reverseCodes[pair.second] = pair.first;
}
string currentCode = "", decodedText = "";
for (char bit : encodedText) {
currentCode += bit;
if (reverseCodes.find(currentCode) != reverseCodes.end()) {
decodedText += reverseCodes[currentCode];
currentCode = "";
}
}
return decodedText;
}
int main() {
string text = "ABRACADABRA";
unordered_map<char, string> codes;
string encodedText = huffmanEncoding(text, codes);
cout << "Encoded Text: " << encodedText << endl;
cout << "Huffman Codes:" << endl;
for (auto& pair : codes) {
cout << pair.first << ": " << pair.second << endl;
}
string decodedText = huffmanDecoding(encodedText, codes);
cout << "Decoded Text: " << decodedText << endl;
return 0;
}
Explanation:
generateCodes
function traverses the tree to generate binary codes for each character.huffmanEncoding
and huffmanDecoding
functions encode and decode the string, respectively.Time Complexity:
Space Complexity: O(n), where n is the number of unique characters in the text.