CanonicalCode.java

package net.sourceforge.plantuml.code.deflate;

/* 
 * Simple DEFLATE decompressor
 * Copyright (c) Project Nayuki
 * 
 * https://www.nayuki.io/page/simple-deflate-decompressor
 * https://github.com/nayuki/Simple-DEFLATE-decompressor
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;

/**
 * A canonical Huffman code, where the code values for each symbol is derived
 * from a given sequence of code lengths. This data structure is immutable. This
 * could be transformed into an explicit Huffman code tree.
 * <p>
 * Example:
 * </p>
 * 
 * <pre>
 *   Code lengths (canonical code):
 *    Symbol A: 1
 *    Symbol B: 0 (no code)
 *    Symbol C: 3
 *    Symbol D: 2
 *    Symbol E: 3
 *  
 *  Generated Huffman codes:
 *    Symbol A: 0
 *    Symbol B: (Absent)
 *    Symbol C: 110
 *    Symbol D: 10
 *    Symbol E: 111
 *  
 *  Huffman code tree:
 *      .
 *     / \
 *    A   .
 *       / \
 *      D   .
 *         / \
 *        C   E
 * </pre>
 */
final class CanonicalCode {

	/*
	 * These arrays store the Huffman codes and values necessary for decoding.
	 * symbolCodeBits contains Huffman codes, each padded with a 1 bit at the
	 * beginning to disambiguate codes of different lengths (e.g. otherwise we can't
	 * distinguish 0b01 from 0b0001). Each symbolCodeBits[i] decodes to its
	 * corresponding symbolValues[i]. Values in symbolCodeBits are strictly
	 * increasing.
	 * 
	 * For the example of codeLengths=[1,0,3,2,3], we would have: i |
	 * symbolCodeBits[i] | symbolValues[i] --+-------------------+---------------- 0
	 * | 0b1_0 | 0 1 | 0b1_10 | 3 2 | 0b1_110 | 2 3 | 0b1_111 | 4
	 */
	private int[] symbolCodeBits;
	private int[] symbolValues;

	/**
	 * Constructs a canonical Huffman code from the specified array of symbol code
	 * lengths. Each code length must be non-negative. Code length 0 means no code
	 * for the symbol. The collection of code lengths must represent a proper full
	 * Huffman code tree.
	 * <p>
	 * Examples of code lengths that result in correct full Huffman code trees:
	 * </p>
	 * <ul>
	 * <li>[1, 1] (result: A=0, B=1)</li>
	 * <li>[2, 2, 1, 0, 0, 0] (result: A=10, B=11, C=0)</li>
	 * <li>[3, 3, 3, 3, 3, 3, 3, 3] (result: A=000, B=001, C=010, ..., H=111)</li>
	 * </ul>
	 * <p>
	 * Examples of code lengths that result in under-full Huffman code trees:
	 * </p>
	 * <ul>
	 * <li>[0, 2, 0] (result: B=00, unused=01, unused=1)</li>
	 * <li>[0, 1, 0, 2] (result: B=0, D=10, unused=11)</li>
	 * </ul>
	 * <p>
	 * Examples of code lengths that result in over-full Huffman code trees:
	 * </p>
	 * <ul>
	 * <li>[1, 1, 1] (result: A=0, B=1, C=overflow)</li>
	 * <li>[1, 1, 2, 2, 3, 3, 3, 3] (result: A=0, B=1, C=overflow, ...)</li>
	 * </ul>
	 * 
	 * @param codeLengths array of symbol code lengths (not {@code null})
	 * @throws NullPointerException     if the array is {@code null}
	 * @throws IllegalArgumentException if any element is negative, any value
	 *                                  exceeds MAX_CODE_LENGTH, or the collection
	 *                                  of code lengths would yield an under-full or
	 *                                  over-full Huffman code tree
	 */
	public CanonicalCode(int[] codeLengths) {
		// Check argument values
		Objects.requireNonNull(codeLengths);
		for (int x : codeLengths) {
			if (x < 0)
				throw new IllegalArgumentException("Negative code length");
			if (x > MAX_CODE_LENGTH)
				throw new IllegalArgumentException("Maximum code length exceeded");
		}

		// Allocate code values to symbols. Symbols are processed in the order
		// of shortest code length first, breaking ties by lowest symbol value.
		symbolCodeBits = new int[codeLengths.length];
		symbolValues = new int[codeLengths.length];
		int numSymbolsAllocated = 0;
		int nextCode = 0;
		for (int codeLength = 1; codeLength <= MAX_CODE_LENGTH; codeLength++) {
			nextCode <<= 1;
			int startBit = 1 << codeLength;
			for (int symbol = 0; symbol < codeLengths.length; symbol++) {
				if (codeLengths[symbol] != codeLength)
					continue;
				if (nextCode >= startBit)
					throw new IllegalArgumentException("This canonical code produces an over-full Huffman code tree");

				symbolCodeBits[numSymbolsAllocated] = startBit | nextCode;
				symbolValues[numSymbolsAllocated] = symbol;
				numSymbolsAllocated++;
				nextCode++;
			}
		}
		if (nextCode != 1 << MAX_CODE_LENGTH)
			throw new IllegalArgumentException("This canonical code produces an under-full Huffman code tree");

		// Trim unused trailing elements
		symbolCodeBits = Arrays.copyOf(symbolCodeBits, numSymbolsAllocated);
		symbolValues = Arrays.copyOf(symbolValues, numSymbolsAllocated);
	}

	/**
	 * Decodes the next symbol from the specified bit input stream based on this
	 * canonical code. The returned symbol value is in the range [0,
	 * codeLengths.length).
	 * 
	 * @param in the bit input stream to read from
	 * @return the next decoded symbol
	 * @throws IOException if an I/O exception occurred
	 */
	public int decodeNextSymbol(BitInputStream in) throws IOException {
		Objects.requireNonNull(in);
		int codeBits = 1; // The start bit
		while (true) {
			// Accumulate one bit at a time on the right side until a match is
			// found in the symbolCodeBits array. Because the Huffman code tree is
			// full, this loop must terminate after at most MAX_CODE_LENGTH iterations.
			codeBits = codeBits << 1 | in.readNoEof();
			int index = Arrays.binarySearch(symbolCodeBits, codeBits);
			if (index >= 0)
				return symbolValues[index];
		}
	}

	/**
	 * Returns a string representation of this canonical code, useful for debugging
	 * only, and the format is subject to change.
	 * 
	 * @return a string representation of this canonical code
	 */
	public String toString() {
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < symbolCodeBits.length; i++) {
			sb.append(String.format("Code %s: Symbol %d%n", Integer.toBinaryString(symbolCodeBits[i]).substring(1),
					symbolValues[i]));
		}
		return sb.toString();
	}

	// The maximum Huffman code length allowed in the DEFLATE standard.
	private static final int MAX_CODE_LENGTH = 15;

}