/* COMPRESS_QMX.C -------------- Copyright (c) 2014 by Andrew Trotman Licensed BSD A version of BinPacking where we pack into a 128-bit SSE register the following: 256 0-bit words 128 1-bit words 64 2-bit words 40 3-bit words 32 4-bit words 24 5-bit words 20 6-bit words 16 8-bit words 12 10-bit words 8 16-bit words 4 32-bit words or pack into two 128-bit words (i.e. 256 bits) the following: 36 7-bit words 28 9-bit words 20 12-bit words 12 21-bit words or pach short sequences as: 1 32-bit word 1 24-bit word 1 16-bit word 1 8-bit word This gives 15 possible combinations. The combinaton is stored in the top 4 bits of a selector byte. The bottom 4-bits of the selector store a run-length (the number of such sequences seen in a row. The 128-bit (or 256-bit) packed binary values are stored first. Then we store the selectors, Finally, stored variable byte encoded, is a pointer to the start of the selector (from the end of the sequence). This way, all reads and writes are 128-bit word aligned, except addressing the selector (and the pointer the selector). These reads are byte aligned. Short sequences are encoded using selectors 0xF0-0xFF. The top nybble is the indicator of a short sequence while the bottom is divided into 2 2-bit numbers, xxyy. xx is the type and yy is the run length. Possible types for xx are: 00 8-bit integer 01 16-bit integer 10 24-bit integer 11 32-bit integer value runlengths for yy are 00, 01, 10, 11. They are the 2's complement of the integer run-length (0-3). */ #include #include #include #include #include #include "compress_qmx.h" //#define MAKE_DECOMPRESS 1 /* uncomment this and it will create a program that writes the decompressor */ //#define TEST_ONE_STRING 1 /* Uncomment this and it will create a program that can be used to test the compressor and decompressor */ #define NO_ZEROS 1 /* stores runs of 256 1s in a row (not 1-bit number, but actual 1 values). */ #define SHORT_END_BLOCKS 1 #ifdef _MSC_VER #define ALIGN_16 __declspec(align(16)) #else #define ALIGN_16 __attribute__ ((aligned (16))) #endif //#define STATS /* uncomment this and it will count the selector usage */ #ifdef STATS static uint32_t stats[65] = {0}; #endif /* ANT_COMPRESS_QMX::ANT_COMPRESS_QMX() ------------------------------------ */ ANT_compress_qmx::ANT_compress_qmx() { length_buffer = NULL; length_buffer_length = 0; } /* ANT_COMPRESS_QMX::~ANT_COMPRESS_QMX() ------------------------------------- */ ANT_compress_qmx::~ANT_compress_qmx() { delete [] length_buffer; #ifdef STATS uint32_t which; for (which = 0; which <= 32; which++) if (stats[which] != 0) printf("%d\t%d\ttimes\n", which, stats[which]); #endif } /* BYTES_NEEDED_FOR() ------------------ */ static uint8_t bytes_needed_for(uint32_t value) { if (value <= 0xFF) return 1; else if (value <= 0xFFFF) return 2; else if (value <= 0xFFFFFF) return 3; else return 4; } /* BITS_NEEDED_FOR() ----------------- */ static uint8_t bits_needed_for(uint32_t value) { if (value == 0x01) return 0; else if (value <= 0x01) return 1; else if (value <= 0x03) return 2; else if (value <= 0x07) return 3; else if (value <= 0x0F) return 4; else if (value <= 0x1F) return 5; else if (value <= 0x3F) return 6; else if (value <= 0x7F) return 7; else if (value <= 0xFF) return 8; else if (value <= 0x1FF) return 9; else if (value <= 0x3FF) return 10; else if (value <= 0xFFF) return 12; else if (value <= 0xFFFF) return 16; else if (value <= 0x1FFFFF) return 21; else return 32; } /* VBYTE_BYTES_NEEDED_FOR() ------------------------ */ static inline uint32_t vbyte_bytes_needed_for(uint32_t docno) { if (docno < (1 << 7)) return 1; else if (docno < (1 << 14)) return 2; else if (docno < (1 << 21)) return 3; else if (docno < (1 << 28)) return 4; else return 5; } /* VBYTE_COMPRESS_INTO() --------------------- NOTE: We compress "backwards" because we want to keep decompressing from the end of the string to get the number */ static inline void vbyte_compress_into(uint8_t *dest, uint32_t docno) { if (docno < (1 << 7)) dest[0] = (docno & 0x7F) | 0x80; else if (docno < (1 << 14)) { dest[1] = (docno >> 7) & 0x7F; dest[0] = (docno & 0x7F) | 0x80; } else if (docno < (1 << 21)) { dest[2] = (docno >> 14) & 0x7F; dest[1] = (docno >> 7) & 0x7F; dest[0] = (docno & 0x7F) | 0x80; } else if (docno < (1 << 28)) { dest[3] = (docno >> 21) & 0x7F; dest[2] = (docno >> 14) & 0x7F; dest[1] = (docno >> 7) & 0x7F; dest[0] = (docno & 0x7F) | 0x80; } else { dest[4] = (docno >> 28) & 0x7F; dest[3] = (docno >> 21) & 0x7F; dest[2] = (docno >> 14) & 0x7F; dest[1] = (docno >> 7) & 0x7F; dest[0] = (docno & 0x7F) | 0x80; } } /* VBYTE_DECOMPRESS() ------------------ NOTE: this method is given a ponter to the end of the v-byte compressed integer. The task is to work backwards until it gets the integer */ static inline uint32_t vbyte_decompress(uint8_t *source) { uint32_t result; if (*source & 0x80) return *source & 0x7F; else { result = *source--; while (!(*source & 0x80)) result = (result << 7) | *source--; return (result << 7) | (*source & 0x7F); } } /* WRITE_OUT() ----------- */ static void write_out(uint8_t **buffer, uint32_t *source, uint32_t raw_count, uint32_t size_in_bits, uint8_t **length_buffer) { uint32_t current, batch; uint8_t *destination = *buffer; uint32_t *end = source + raw_count; uint8_t *key_store = *length_buffer; uint32_t ALIGN_16 sequence_buffer[4]; uint32_t instance, value; uint8_t type; uint32_t count; uint32_t max_bytes = 1; // this is the bytw-width for type128 encoded non-SSE integers #ifdef STATS stats[size_in_bits] += raw_count; #endif if (size_in_bits == 0) { type = 0; count = (raw_count + 255) / 256; } else if (size_in_bits == 1) { type = 1; // 1 bit per integer count = (raw_count + 127) / 128; } else if (size_in_bits == 2) { type = 2; // 2 bits per integer count = (raw_count + 63) / 64; } else if (size_in_bits == 3) { type = 3; // 3 bits per integer count = (raw_count + 39) / 40; } else if (size_in_bits == 4) { type = 4; // 4 bits per integer count = (raw_count + 31) / 32; } else if (size_in_bits == 5) { type = 5; // 5 bits per integer count = (raw_count + 23) / 24; } else if (size_in_bits == 6) { type = 6; // 6 bits per integer count = (raw_count + 19) / 20; } else if (size_in_bits == 7) { type = 7; // 7 bits per integer, 18 integers per read (but requires 2 reads) count = (raw_count + 35) / 36; } else if (size_in_bits == 8) { type = 8; // 8 bits per integer count = (raw_count + 15) / 16; } else if (size_in_bits == 9) { type = 9; // 9 bits per integer, 14 integers per read (but requires 2 reads) count = (raw_count + 27) / 28; } else if (size_in_bits == 10) { type = 10; // 10 bits per integer count = (raw_count + 11) / 12; } else if (size_in_bits == 12) { type = 11; // 12 bits per integer, 10 integers per read (but requires 2 reads) count = (raw_count + 19) / 20; } else if (size_in_bits == 16) { type = 12; // 16 bits per integer count = (raw_count + 7) / 8; } else if (size_in_bits == 21) { type = 13; // 21 bits per integer, 6 integers per read (but requires 2 reads) count = (raw_count + 11) / 12; } else if (size_in_bits == 32) { type = 14; // 32 bits per integer count = (raw_count + 3) / 4; } else if (size_in_bits == 128) { type = 15; count = raw_count; /* As the count for type 128 can only be 1, 2, or 3, we can re-appropriate it and store the bit-length in there too. */ max_bytes = 1; for (uint32_t integer = 0; integer < count; integer++) { if (bytes_needed_for(source[integer]) > max_bytes) max_bytes = bytes_needed_for(source[integer]); } } else exit(printf("Can't compress into integers of size %dbits\n", size_in_bits)); while (count > 0) { batch = count > 16 ? 16 : count; *key_store++ = (type << 4) | (~(batch - 1) & 0x0F); count -= batch; for (current = 0; current < batch; current++) { switch (size_in_bits) { case 0: // 0 bits per integer (i.e. a long sequence of zeros) /* In this case we don't need to store a 4 byte integer because its implicit */ source += 256; break; case 1: // 1 bit per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 128; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 1); memcpy(destination, sequence_buffer, 16); destination += 16; source += 128; break; case 2: // 2 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 64; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 2); memcpy(destination, sequence_buffer, 16); destination += 16; source += 64; break; case 3: // 3 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 40; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 3); memcpy(destination, sequence_buffer, 16); destination += 16; source += 40; break; case 4: // 4 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 32; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 4); memcpy(destination, sequence_buffer, 16); destination += 16; source += 32; break; case 5: // 5 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 24; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 5); memcpy(destination, sequence_buffer, 16); destination += 16; source += 24; break; case 6: // 6 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 20; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 6); memcpy(destination, sequence_buffer, 16); destination += 16; source += 20; break; case 7: // 7 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 20; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 7); memcpy(destination, sequence_buffer, 16); destination += 16; memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 16; value < 20; value++) sequence_buffer[value & 0x03] |= source[value] >> 4; for (value = 20; value < 36; value++) sequence_buffer[value & 0x03] |= source[value] << (((value - 20) / 4) * 7 + 3); memcpy(destination, sequence_buffer, 16); destination += 16; source += 36; // 36 in a double 128-bit word break; case 8: // 8 bits per integer #ifdef SHORT_END_BLOCKS for (instance = 0; instance < 16 && source < end; instance++) #else for (instance = 0; instance < 16; instance++) #endif *destination++ = (uint8_t)*source++; break; case 9: // 9 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 16; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 9); memcpy(destination, sequence_buffer, 16); destination += 16; memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 12; value < 16; value++) sequence_buffer[value & 0x03] |= source[value] >> 5; for (value = 16; value < 28; value++) sequence_buffer[value & 0x03] |= source[value] << (((value - 16) / 4) * 9 + 4); memcpy(destination, sequence_buffer, 16); destination += 16; source += 28; // 28 in a double 128-bit word break; case 10: // 10 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 12; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 10); memcpy(destination, sequence_buffer, 16); destination += 16; source += 12; break; case 12: // 12 bit integers memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 12; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 12); memcpy(destination, sequence_buffer, 16); destination += 16; memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 8; value < 12; value++) sequence_buffer[value & 0x03] |= source[value] >> 8; for (value = 12; value < 20; value++) sequence_buffer[value & 0x03] |= source[value] << (((value - 12) / 4) * 12 + 8); memcpy(destination, sequence_buffer, 16); destination += 16; source += 20; // 20 in a double 128-bit word break; case 16: // 16 bits per integer #ifdef SHORT_END_BLOCKS for (instance = 0; instance < 8 && source < end; instance++) #else for (instance = 0; instance < 8; instance++) #endif { *(uint16_t *)destination = (uint16_t)*source++; destination += 2; } break; case 21: // 21 bits per integer memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 0; value < 8; value++) sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 21); memcpy(destination, sequence_buffer, 16); destination += 16; memset(sequence_buffer, 0, sizeof(sequence_buffer)); for (value = 4; value < 8; value++) sequence_buffer[value & 0x03] |= source[value] >> 11; for (value = 8; value < 12; value++) sequence_buffer[value & 0x03] |= source[value] << (((value - 8) / 4) * 21 + 11); memcpy(destination, sequence_buffer, 16); destination += 16; source += 12; // 12 in a double 128-bit word break; case 32: // 32 bits per integer #ifdef SHORT_END_BLOCKS for (instance = 0; instance < 4 && source < end; instance++) #else for (instance = 0; instance < 4; instance++) #endif { *(uint32_t *)destination = (uint32_t)*source++; destination += 4; } break; case 128: if (max_bytes == 1) { *(uint8_t *)destination = (uint8_t)*source; source++; destination += 1; *(key_store - 1) = (type << 4) | (~(batch - 1) & 0x03); } else if (max_bytes == 2) { *(uint16_t *)destination = (uint16_t)*source; source++; destination += 2; *(key_store - 1) = (type << 4) | 4 | (~(batch - 1) & 0x03); } else if (max_bytes == 3) { *destination++ = (uint8_t)((*source >> 16) & 0xFF); *destination++ = (uint8_t)((*source >> 8) & 0xFF); *destination++ = (uint8_t)((*source >> 0) & 0xFF); source++; *(key_store - 1) = (type << 4) | 8 | (~(batch - 1) & 0x03); } else if (max_bytes == 4) { *(uint32_t *)destination = (uint32_t)*source; source++; destination += 4; *(key_store - 1) = (type << 4) | 0x0C | (~(batch - 1) & 0x03); } else printf("max_bytes must be 1, 2, 3, or 4, but is:%d", (int)max_bytes); break; } } } *buffer = destination; *length_buffer = key_store; } /* MAX() ----- */ template T max(T a, T b) { return a > b ? a : b; } /* MAX() ----- */ template T max(T a, T b, T c, T d) { return max(max(a, b), max(c, d)); } /* ANT_COMPRESS_QMX::ENCODEARRAY() ------------------------------- */ void ANT_compress_qmx::encodeArray(const uint32_t *source, uint64_t source_integers, uint32_t *into, uint64_t *nvalue) { const uint32_t WASTAGE = 512; uint8_t *current_length, *destination = (uint8_t *)into, *keys; uint32_t *current, run_length, bits, new_needed, wastage; uint32_t block, largest; /* make sure we have enough room to store the lengths */ if (length_buffer_length < source_integers) { delete [] length_buffer; length_buffer = new uint8_t [(size_t)(length_buffer_length = source_integers) + WASTAGE]; } /* Get the lengths of the integers */ current_length = length_buffer; for (current = (uint32_t *)source; current < source + source_integers; current++) *current_length++ = bits_needed_for(*current); /* Shove a bunch of 0 length integers on the end to allow for overflow */ for (wastage = 0; wastage < WASTAGE; wastage++) *current_length++ = 0; /* Process the lengths. To maximise SSE throughput we need each write to be 128-bit (4*32-bit) alignned and therefore we need each compress "block" to be the same size where a compress "block" is a set of four encoded integers starting on a 4-integer boundary. */ for (current_length = length_buffer; current_length < length_buffer + source_integers + 4; current_length += 4) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = max(*current_length, *(current_length + 1), *(current_length + 2), *(current_length + 3)); /* This code makes sure we can do aligned reads, promoting to larger integers if necessary */ current_length = length_buffer; while (current_length < length_buffer + source_integers) { #ifdef SHORT_END_BLOCKS /* If there are fewer than 16 values remaining and they all fit into 8-bits then its smaller than storing stripes If there are fewer than 8 values remaining and they all fit into 16-bits then its smaller than storing stripes If there are fewer than 4 values remaining and they all fit into 32-bits then its smaller than storing stripes */ if (source_integers - (current_length - length_buffer) < 4) { largest = 0; for (block = 0; block < 8; block++) largest = max((uint8_t)largest, *(current_length + block)); if (largest <= 8) for (block = 0; block < 8; block++) *(current_length + block) = 8; else if (largest <= 16) for (block = 0; block < 8; block++) *(current_length + block) = 16; else if (largest <= 32) for (block = 0; block < 8; block++) *(current_length + block) = 32; } else if (source_integers - (current_length - length_buffer) < 8) { largest = 0; for (block = 0; block < 8; block++) largest = max((uint8_t)largest, *(current_length + block)); if (largest <= 8) for (block = 0; block < 8; block++) *(current_length + block) = 8; else if (largest <= 8) for (block = 0; block < 8; block++) *(current_length + block) = 16; } else if (source_integers - (current_length - length_buffer) < 16) { largest = 0; for (block = 0; block < 16; block++) largest = max((uint8_t)largest, *(current_length + block)); if (largest <= 8) for (block = 0; block < 16; block++) *(current_length + block) = 8; } /* Otherwise we have the standard rules for a block */ #endif /* Two things need to happen to be able to use a particular selector. The first is that all the values that would end up in that block need to use at most the bit value of that block. The second is that there need to be at least as many numbers remaining as the block encodes. For example, if the current block only needs 0-bits per int, then check that the 256 values that would be encoded only take 0-bits. If any value needs more, or there aren't 256 numbers remaining, then promote the current block to try encode 128 1-bit values. */ switch (*current_length) { case 0: if (source_integers - (current_length - length_buffer) < 256) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote break; } for (block = 0; block < 256; block += 4) if (*(current_length + block) > 0) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote if (*current_length == 0) { for (block = 0; block < 256; block++) current_length[block] = 0; current_length += 256; } break; case 1: if (source_integers - (current_length - length_buffer) < 128) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote break; } for (block = 0; block < 128; block += 4) if (*(current_length + block) > 1) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote if (*current_length == 1) { for (block = 0; block < 128; block++) current_length[block] = 1; current_length += 128; } break; case 2: if (source_integers - (current_length - length_buffer) < 64) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote break; } for (block = 0; block < 64; block += 4) if (*(current_length + block) > 2) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote if (*current_length == 2) { for (block = 0; block < 64; block++) current_length[block] = 2; current_length += 64; } break; case 3: if (source_integers - (current_length - length_buffer) < 40) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote break; } for (block = 0; block < 40; block += 4) if (*(current_length + block) > 3) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote if (*current_length == 3) { for (block = 0; block < 40; block++) current_length[block] = 3; current_length += 40; } break; case 4: if (source_integers - (current_length - length_buffer) < 32) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote break; } for (block = 0; block < 32; block += 4) if (*(current_length + block) > 4) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote if (*current_length == 4) { for (block = 0; block < 32; block++) current_length[block] = 4; current_length += 32; } break; case 5: if (source_integers - (current_length - length_buffer) < 24) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote break; } for (block = 0; block < 24; block += 4) if (*(current_length + block) > 5) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote if (*current_length == 5) { for (block = 0; block < 24; block++) current_length[block] = 5; current_length += 24; } break; case 6: if (source_integers - (current_length - length_buffer) < 20) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote break; } for (block = 0; block < 20; block += 4) if (*(current_length + block) > 6) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote if (*current_length == 6) { for (block = 0; block < 20; block++) current_length[block] = 6; current_length += 20; } break; case 7: if (source_integers - (current_length - length_buffer) < 36) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote break; } for (block = 0; block < 36; block += 4) // 36 in a double 128-bit word if (*(current_length + block) > 7) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote if (*current_length == 7) { for (block = 0; block < 36; block++) current_length[block] = 7; current_length += 36; } break; case 8: if (source_integers - (current_length - length_buffer) < 16) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote break; } for (block = 0; block < 16; block += 4) if (*(current_length + block) > 8) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote if (*current_length == 8) { for (block = 0; block < 16; block++) current_length[block] = 8; current_length += 16; } break; case 9: if (source_integers - (current_length - length_buffer) < 28) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote break; } for (block = 0; block < 28; block += 4) // 28 in a double 128-bit word if (*(current_length + block) > 9) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote if (*current_length == 9) { for (block = 0; block < 28; block++) current_length[block] = 9; current_length += 28; } break; case 10: if (source_integers - (current_length - length_buffer) < 12) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote break; } for (block = 0; block < 12; block += 4) if (*(current_length + block) > 10) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote if (*current_length == 10) { for (block = 0; block < 12; block++) current_length[block] = 10; current_length += 12; } break; case 12: if (source_integers - (current_length - length_buffer) < 20) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote break; } for (block = 0; block < 20; block += 4) // 20 in a double 128-bit word if (*(current_length + block) > 12) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote if (*current_length == 12) { for (block = 0; block < 20; block++) current_length[block] = 12; current_length += 20; } break; case 16: if (source_integers - (current_length - length_buffer) < 8) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote break; } for (block = 0; block < 8; block += 4) if (*(current_length + block) > 16) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote if (*current_length == 16) { for (block = 0; block < 8; block++) current_length[block] = 16; current_length += 8; } break; case 21: if (source_integers - (current_length - length_buffer) < 12) { *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote break; } for (block = 0; block < 12; block += 4) // 12 in a double 128-bit word if (*(current_length + block) > 21) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote if (*current_length == 21) { for (block = 0; block < 12; block++) current_length[block] = 21; current_length += 12; } break; case 32: if (source_integers - (current_length - length_buffer) < 4) { for (block = 0; block < (source_integers - (current_length - length_buffer)); block++) *(current_length + block) = 128; // promote break; } for (block = 0; block < 4; block += 4) if (*(current_length + block) > 32) *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 64; // promote if (*current_length == 32) { for (block = 0; block < 4; block++) current_length[block] = 32; current_length += 4; } break; case 128: /* The 128-bit selector is used as a last resort when there are not enough numbers to use an earlier selector. So don't worry about checking the rest. */ current_length += source_integers - (current_length - length_buffer); break; default: exit(printf("Selecting on a non whole power of 2, must exit\n")); break; } } /* We can now compress based on the lengths in length_buffer */ run_length = 1; bits = length_buffer[0]; keys = length_buffer; // we're going to re-use the length_buffer because it can't overlap and this saves a double malloc for (current = (uint32_t *)source + 1; current < source + source_integers; current++) { new_needed = length_buffer[current - source]; if (new_needed == bits) run_length++; else { write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); bits = new_needed; run_length = 1; } } write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); /* Copy the lengths to the end */ memcpy(destination, length_buffer, keys - length_buffer); destination += keys - length_buffer; /* Add the pointer to the lengths */ uint32_t val = keys - length_buffer + vbyte_bytes_needed_for(keys - length_buffer); // offset (from the end) to the start of the keys if (vbyte_bytes_needed_for(val) > vbyte_bytes_needed_for(keys - length_buffer)) val = keys - length_buffer + vbyte_bytes_needed_for(val); // although rare, this happens when adding the length of the vbyte encoded length makes the vbyte encoding one byte longer (i.e. 127) vbyte_compress_into(destination, val); destination += vbyte_bytes_needed_for(val); /* Compute the length (in bytes) */ *nvalue = destination - (uint8_t *)into; // return length in bytes } #ifdef MAKE_DECOMPRESS /* The following program generates the source code for ANT_compress_qmx::decodeArray() */ /* MAIN() ------ This version assumes SSE4.1 and so it is *not* portable to non X86 architectures */ int main(void) { uint32_t instance; printf("static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};\n"); printf("static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};\n"); printf("static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};\n"); printf("static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};\n"); printf("static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};\n"); printf("static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f};\n"); printf("static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};\n"); printf("static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};\n"); printf("static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};\n"); printf("static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};\n"); printf("static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01};\n"); printf("void ANT_compress_qmx::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)\n"); printf("{\n"); printf("__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;\n"); printf("uint8_t *in = (uint8_t *)source;\n"); printf("uint32_t *end = to + destination_integers;\n"); printf("uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1);\n"); printf("uint8_t *keys = (uint8_t *)source + len - key_start;\n"); printf("\n"); printf("mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);\n"); printf("mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);\n"); printf("mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);\n"); printf("mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);\n"); printf("mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);\n"); printf("mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);\n"); printf("mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);\n"); printf("mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);\n"); printf("mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);\n"); printf("mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);\n"); printf("mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);\n"); printf("\n"); printf("while (to < end)\n"); printf("\t{\n"); printf("\tswitch (*keys++)\n"); printf("\t\t{\n"); for (instance = 0; instance <= 0xFF; instance++) { printf("\t\tcase 0x%02x:\n", instance); if ((instance >> 4) == 0) { /* 256 0-bit integers */ printf("#ifdef NO_ZEROS\n"); printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)static_mask_1);\n"); printf("#else\n"); printf("\t\t\ttmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));\n"); printf("#endif\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 32, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 33, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 34, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 35, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 36, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 37, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 38, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 39, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 40, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 41, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 42, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 43, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 44, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 45, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 46, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 47, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 48, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 49, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 50, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 51, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 52, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 53, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 54, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 55, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 56, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 57, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 58, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 59, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 60, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 61, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 62, tmp);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 63, tmp);\n"); printf("\t\t\tto += 256;\n"); // becomes 256 integers } else if (instance >> 4 == 1) { /* 128 * 1-bit integers */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 128;\n"); // becomes 128 integers } else if (instance >> 4 == 2) { /* 64 * 2-bit integers */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 64;\n"); // becomes 64 integers } else if (instance >> 4 == 3) { /* 40 * 3-bit integers */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 40;\n"); // becomes 40 integers } else if (instance >> 4 == 4) { /* 32 * 4-bit integers */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 32;\n"); // becomes 32 integers } else if (instance >> 4 == 5) { /* 24 * 5-bit integers */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 24;\n"); // becomes 24 integers } else if (instance >> 4 == 6) { /* 20 * 6-bit integers */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 20;\n"); // becomes 20 integers } else if (instance >> 4 == 7) { /* 36 * 7 bit integers (in two 128-bit words) */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 3);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));\n"); printf("\t\t\tin += 32;\n"); // 32 bytes printf("\t\t\tto += 36;\n"); // becomes 36 integers } else if (instance >> 4 == 8) { /* 16 * 8-bit integers */ printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));\n"); printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));\n"); printf("\t\t\ttmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));\n"); printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 16;\n"); // becomes 16 integers } else if (instance >> 4 == 9) { /* 28 * 9-bit ingtegers (in two 128-bit words) */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));\n"); printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 4);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));\n"); printf("\t\t\tin += 32;\n"); // 32 bytes printf("\t\t\tto += 28;\n"); // becomes 28 integers } else if (instance >> 4 == 10) { /* 12 * 10-bit integers */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 12;\n"); // becomes 12 integers } else if (instance >> 4 == 11) { /* 20 * 12-bit ingtegers (in two 128-bit words) */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));\n"); printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 8);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));\n"); printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));\n"); printf("\t\t\tin += 32;\n"); // 32 bytes printf("\t\t\tto += 20;\n"); // becomes 20 integers } else if (instance >> 4 == 12) { /* 16-bit integers */ printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 8;\n"); // becomes 8 integers } else if (instance >> 4 == 13) { /* 12 * 21-bit ingtegers (in two 128-bit words) */ printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));\n"); printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));\n"); printf("\t\t\tin += 32;\n"); // 32 bytes printf("\t\t\tto += 12;\n"); // becomes 8 integers } else if (instance >> 4 == 14) { /* 32-bit integers */ printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); printf("\t\t\tin += 16;\n"); // 16 bytes printf("\t\t\tto += 4;\n"); // becomes 4 integers } else if (instance >> 4 == 15) { /* 128-bit integers if there are fewer than 4 integes then we just bit-pack them in to 8, 16, 24, or 32-bit words */ if ((instance & 0x0C) == 0x00) { printf("\t\t\t*to = *(uint8_t *)in;\n"); printf("\t\t\tin += 1;\n"); // 1 byte integer printf("\t\t\tto += 1;\n"); // becomes 1 integer } else if ((instance & 0x0C) == 0x04) { printf("\t\t\t*to = *(uint16_t *)in;\n"); printf("\t\t\tin += 2;\n"); // 2 byte integers printf("\t\t\tto += 1;\n"); // becomes 1 integer } else if ((instance & 0x0C) == 0x08) { printf("\t\t\t*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));\n"); printf("\t\t\tin += 3;\n"); // 3 byte integer printf("\t\t\tto += 1;\n"); // becomes 1 integer } else if ((instance & 0x0C) == 0x0C) { printf("\t\t\t*to = *(uint32_t *)in;\n"); printf("\t\t\tin += 4;\n"); // 4 byte integer printf("\t\t\tto += 1;\n"); // becomes 1 integer } if (instance == 0xFF || instance == 0xFB || instance == 0xF7 || instance == 0xF3) printf("\t\t\tbreak;\n"); } else { printf("\t\t\tin++;\n"); // dummy, can't occur } if ((instance & 0xF) == 0xF) printf("\t\t\tbreak;\n"); // every 32 instances we break (its the end of the fall through) } printf("\t\t}\n"); printf("\t}\n"); printf("}\n"); } #endif #ifdef TEST_ONE_STRING static uint32_t sequence[]={0x80, 0x80FF, 0x80FFFF}; static uint32_t sequence_unused[]={13,1,1,26,18,3,1,9,4,8,5,19,7,26,1,5,7,3,12,5,39,16,3,5,19,8,18,1,1,1,2,5,9,3,21,2,6,37,3,5,5,18,3,31,3,22,5,17,6,12,6,2,5,10,3,12,51,14,7,8,1,2,3,27,19,1,10,8,2,7,2,9,16,6,6,5,6,4,18,21,13,2,1,11,3,22,2,16,13,61,21,12,51,10,6,31,14,65,15,82,5,4,18,3,1,1,4,34,5,9,4,7,1,25,17,52,60,8,8,4,22,7,49,26,2,72,29,33,6,11,3,8,1,23,37,1,3,1,1,1,3,20,6,1,2,1,1,1,14,2,4,1,6,4,4,3,1,1,2,2,1,9,29,1,10,11,4,10,31}; static uint32_t second_compress_buffer[100000]; static uint32_t second_decompress_buffer[100000]; uint32_t second_compress_buffer_size = sizeof(second_compress_buffer) / sizeof(*second_compress_buffer); uint32_t second_decompress_buffer_size = sizeof(second_decompress_buffer) / sizeof(*second_decompress_buffer); /* CHECK() ------- */ void check(uint32_t *sequence, uint32_t sequence_length) { ANT_compress_qmx compressor; uint64_t buffer_size; uint32_t pos; uint32_t fail; memset(second_compress_buffer, 0, second_compress_buffer_size); memset(second_decompress_buffer, 0, second_decompress_buffer_size); compressor.encodeArray(sequence, sequence_length, (uint32_t *)second_compress_buffer, &buffer_size); printf("%u integers became %u bytes\n", sequence_length, buffer_size); second_compress_buffer[buffer_size] = 0; second_compress_buffer[buffer_size + 1] = 0; second_compress_buffer[buffer_size + 2] = 0; second_compress_buffer[buffer_size + 3] = 0; for (pos = 0; pos < buffer_size; pos++) printf("%02X ", ((uint8_t *)second_compress_buffer)[pos]); puts(""); compressor.decodeArray((uint32_t *)second_compress_buffer, buffer_size, (uint32_t *)second_decompress_buffer, sequence_length); fail = false; for (pos = 0; pos < sequence_length; pos++) if (sequence[pos] != second_decompress_buffer[pos]) { printf("p[%d]:%X != %X\n", pos, sequence[pos], second_decompress_buffer[pos]); fail = true; } else printf("p[%d]:%X == %X\n", pos, sequence[pos], second_decompress_buffer[pos]); if (fail) puts("Test failed"); else puts("Test succeeded"); } /* MAIN() ------ */ int main(void) { check(sequence, sizeof(sequence) / sizeof(*sequence)); } #endif /* ANT_COMPRESS_QMX::DECODEARRAY() -------------------------------- this code was generated by the method above. */ #include "compress_qmx_decompress.cpp"