valdirunars/bigintcompress
An algorithm designed for compressing large collections of elements each having only a few possible values. E.g types representable by an `enum`.
The Algorithm
public func encode() -> Data? {
var number: CompressionNumber = 0
for element in self {
number = number * Self.maxUniqueComponentCount + Self.compressionNumberValue(for: element)
}
let string = number.hexString
let mutableData = NSMutableData()
var i = 0
while i < (string.count - 1) {
if let byte = string.getHexByte(startIndex: i, endIndex: i+2) {
mutableData.append(Data(repeating: byte, count: 1))
}
i += 2
}
if string.count % 2 == 1,
let byte = string.getHexByte(startIndex: string.count-1, endIndex: string.count) {
mutableData.append(Data(repeating: byte, count: 1))
}
let finalData = NSMutableData()
var count = self.count
let countData = Data(bytes: &count,
count: sizeOfInt)
finalData.append(countData)
finalData.append(mutableData as Data)
return finalData as Data
}How to Use
Swift Package Manager
.package(url: "https://github.com/valdirunars/BigIntCompress.git", from: "1.0.0"),Making your collection Compressable
First we must find a big integer type and make it conform to BigIntCompress' BigIntType there are many sufficient swift libraries out there. I usually use BigInt
extension BigInt: BigIntType {
public var hexString: String {
return String(self, radix: 16)
}
public init?(hexString: String) {
self.init(hexString, radix: 16)
}
public init<T>(_ value: T) where T : Numeric {
self.init(value)
}
}Great! Now we are ready to be Compressable. Lets say that we are working with strings where the possible values for each character are A, C, G & T, like in the case of a DNA's nucleotide.
extension String: Compressable {
public typealias CompressionNumber = BigInt
public static var possibleComponents: [Element] {
return [ "A", "C", "G", "T" ]
}
public static func single(_ element: Element) -> String {
return "\(element)"
}
}Now String has two new properties. A static variable bic for decoding and an instance variable with an identical name for encoding.
Encoding and decoding
let expected = """
CCAAGGATTTCCAAGGATTTTTCTCCACTGTTCTCCACTGTTCTCCACTGACAACCCTGGCCACGTATTCTCCACTGGCCACGTAACAACCCTGGCCACGTACCAAGGATTTGGACGGCTCCCCAAGGATTTGGACGGCTCCGCCACGTAGCCACGTATTCTCCACTGACAACCCTGACAACCCTGGCCACGTAGGACGGCTCCACAACCCTGCCAAGGATTTTTCTCCACTGGCCACGTATTCTCCACTGGGACGGCTCCGGACGGCTCCCCAAGGATTTGCCACGTATTCTCCACTGGGACGGCTCCGCCACGTAGGACGGCTCCGGACGGCTCCCCAAGGATTTTTCTCCACTGGGACGGCTCCTTCTCCACTGCCAAGGATTTCCAAGGATTTGCCACGTACCAAGGATTTGGACGGCTCCGCCACGTAGCCACGTACCAAGGATTTCCAAGGATTTGGACGGCTCCTTCTCCACTGTTCTCCACTGCCAAGGATTTTTCTCCACTGGGACGGCTCCACAACCCTGGGACGGCTCCACAACCCTGTTCTCCACTGTTCTCCACTGTTCTCCACTGCCAAGGATTTGGACGGCTCCCCAAGGATTTTTCTCCACTGTTCTCCACTGGGACGGCTCCGCCACGTAGGACGGCTCCACAACCCTGTTCTCCACTGGCCACGTAGCCACGTAACAACCCTGGCCACGTAACAACCCTGCCAAGGATTTCCAAGGATTTCCAAGGATTTACAACCCTGGCCACGTAGGACGGCTCCGCCACGTATTCTCCACTGGCCACGTAACAACCCTGGCCACGTAACAACCCTGGCCACGTAGCCACGTAGCCACGTATTCTCCACTGGGACGGCTCCCCAAGGATTTGCCACGTAGGACGGCTCCTTCTCCACTGGGACGGCTCCGCCACGTAACAACCCTGTTCTCCACTGGCCACGTACCAAGGATTTCCAAGGATTTGGACGGCTCCCCAAGG
"""
let compressed = expected.bic.encode()
let back = try! String.bic.decode(compressed!)!
XCTAssert(back == expected)Performance
let asciiData: Data! = expected.data(using: .ascii)
XCTAssert(compressed.count * 3 < asciiData.count)Disadvantages
The algorithm is lossy in the sense that it focuses primarily on a collection's object and thus looses metadata of those collections. BigIntCompress could be implemented to support this appending metadata to the compressed data, in fact it is on my TODO list.
But for now, it is only feasable for compressing "raw" data formats
Package Metadata
Repository: valdirunars/bigintcompress
Default branch: master
README: README.md