The start idea comes from the Stern-Brocot tree which contains fractions.
I used this tree and put on it's leafs, binary labels.
With this new structure I observed that on the exterior of the tree I obtain fractions with small values for nominator and denominator which has a longer binary way formed by labels (for example the fraction 1/100 has 100 of 0).
I propose to use these codes and use instead of them, the obtained Stern-Brocot fraction.
The Motivation of the proposed Algorithm
The proposed algorithm uses the Calkin-Wilf tree instead of Stern-Brocot tree due to the following reasons:
- in the decompression part the fractions can be produced faster by applying in an inverse way the construction algorithm: taking into consideration a fraction a/b, if it's value is bigger than 1 then the fraction from the root node contains the value (a-b)/b and otherwise, the root fraction will be a/(b-a).
The steps of the proposed algorithm
- reading sequentially the binary representations of the ASCII codes of the input chars
- the construction with these bits of the road in the Calkin-Wilf tree
- in case that is obtained a fraction with nominator or denominator which can be represent on 32 bits, than:
- if the length of the way in the tree is longer than 64 than in the compressed file is written the fraction
- otherwise the original bits are copied in the compressed file
Examples of the results obtained on some classic files
Lena original Lena negative Lena black-white Lena grayscale Lena odd lines
File name
|
Original Size (Bytes)
|
Number of Gained BITS
|
Lena original (gray)
|
786.486
|
- 22.416
|
Lena negativ
|
786.486
|
- 22.472
|
Lena 512 colors
|
786.486
|
- 1.306
|
Lena black-white
|
32.830
|
+ 74.369
|
Lena grayscale
|
263.222
|
- 12.765
|
Lena – odd lines
|
496.694
|
+ 2.124.584 / 8 = 265.573 Bytes
|
Examples of the algorithm on some classic files
File name
|
Original size (Bytes)
|
Number of GAINED BITS
|
RAR archive size
|
Notepad.exe
Version 5.1 (Build 2600.xpsp_sp2_rtm.040803-2158: SP2)
|
69.120
|
73.371 / 8 = 9171 bytes
|
33.530 Bytes
|
Regedit.exe
Version 5.1 (Build 2600.xpsp_sp2_rtm.040803-2158: SP2)
|
146.432
|
95.412 / 8 = 11.926 bytes
|
53.424 Bytes
|
TASKMAN.exe
|
15.360
|
15.696 / 8 = 1962 bytes
|
7.169 Bytes
|
Winhelp.exe
|
256.192
|
-13.358
|
125.426 Bytes
|
Twain.dll
Version 1.7.0.0
|
94.784
|
+6.120
|
36.941 Bytes
|
Firefox.exe
Version 1.9.2.3828, Product version 3.6.6
|
910.296
|
2.065.515 / 8 = 258.189 bytes
|
153.137 Bytes
|
Remarks
Running for many times the algorithm, I observed the following:
- the proposed algo do not realize all the time a compression of a file
- from the file that can be compressed better, there are: black-white pictures, exe file or files that contains many NUL chars or many chars with ASCII code 0xFF.
It is interesting to study if such tree can be generating for obtaining as much as many reductible fractions for using this proposed algo and combining it with others compression algorithms.
The next version of code will contain instead of copying the original bits, an classic compression method.
BETA Source code for the Proposed COMPRESSION Algorithm
# include "conio.h"
# include "stdio.h"
# include "string.h"
# define MAXVALUE 0x7FFFFFFF
typedef unsigned long int UL_int;
FILE * pf,* rez_aux, * rez;
UL_int NOMINATOR, DENOMINATOR;
UL_int ch_afis = 0;
UL_int in_size;
UL_int nCHAR_read = 0;
UL_int tc_length = 0;
UL_int number_ch_written = 0;
long c = -1;
long GAIN = 0;
int ct8 = 0;
int i = 0;
int rb;
int tcc = 1;
int tpc = 1;
int write_char = 0;
int ch_read;//The ASCII code of the character
char bin_asc[9];
char sir_length[33];
char FRAGMENT_CHAR[(MAXVALUE>>1)];
char write_contor = 0;
/*----------------------------------------------------*/
void Write_In_File(char * sir)
{
UL_int k_var;
UL_int length = 0;
length = strlen(sir);
for(k_var=0;k_var
{
write_char = write_char * 2;
if(sir[k_var] == '1')
{
write_char = write_char + 1;
}
write_contor++;
if((write_contor>0)&&((write_contor % 8)==0))
{
fputc(write_char,rez);
write_contor = 0;
write_char = 0;
number_ch_written ++;
}
}
}
void write_progress()
{
ch_afis ++;
if( (ch_afis % 1000 ) == 0 )
{
printf("\b");
switch( (ch_afis / 1000) % 4)
{
case 0: printf("-"); break;
case 1: printf("\\"); break;
case 2: printf("|"); break;
case 3: printf("/"); break;
}
printf("%7.3f\%%",(((double)nCHAR_read/in_size)*100));
printf("\b\b\b\b\b\b\b\b");
}
}
void Write_Original_Bits()
{
fprintf(rez_aux,"#0 LENGTH = %28lu", tc_length);
fprintf(rez_aux," GAIN = %10ld",GAIN);
fprintf(rez_aux," TOTAL = %10lu\n",8*in_size);
Write_In_File(FRAGMENT_CHAR);
}
void Write_Fractions(UL_int NOM, UL_int DENOM)
{
int k;
//Write the "bookmark"
Write_In_File("1");
for(k=31;k>=0;k--)
if( ((NOM & (1<> k) == 1 )
Write_In_File("1");
else if( ((NOM & (1<> k) == 0 )
Write_In_File("0");
for(k=31;k>=0;k--)
if( ((DENOM & (1<>k) == 1 )
Write_In_File("1");
else if( ((DENOM & (1<>k) == 0 )
Write_In_File("0");
}
/*----------------------------------------------------------------------*/
/* Function for the generating the "Calkin-Wilf" tree of "c" level */
/*----------------------------------------------------------------------*/
void CW_tree()
{
Again:;
c = c+1;
if( !feof(pf) )
{
//The NOMINATOR or DENOMINATOR has 32 bits
if( (NOMINATOR>MAXVALUE )||( DENOMINATOR>MAXVALUE) )
{
//The fractions should be written in the file
if( c > 64 )
{
tcc = 1;
if((tcc!=tpc))
{
//Write the "bookmark"
Write_In_File("0");
tc_length = strlen(FRAGMENT_CHAR) - c;
sir_length[0] = '\0';
for(int i=31;i>=0;i--)
if(((tc_length&(1<> i) == 1)
sir_length[31-i] = '1';
else if(((tc_length&(1<> i) == 0)
sir_length[31-i] = '0';
sir_length[32] = '\0';
GAIN = GAIN - 33;
Write_In_File(sir_length);
sir_length[0] = '\0';
//Modify here the FRAGMENT_CHAR: shorter with "c" position
FRAGMENT_CHAR[tc_length] = '\0';
Write_Original_Bits();
}
FRAGMENT_CHAR[0] = '\0';
Write_Fractions(NOMINATOR,DENOMINATOR);
GAIN = GAIN + c - 65;
fprintf(rez_aux,"#1 %10lu/%10lu",NOMINATOR,DENOMINATOR);
fprintf(rez_aux," H_Tree = %5ld ",c);
fprintf(rez_aux," GAIN = %10ld",GAIN);
fprintf(rez_aux," TOTAL = %10lu\n",8*in_size);
}
//The ORIGINAL bits should be copied in the file
else
{
tcc = 0;
}
tpc = tcc;
NOMINATOR = 1;
DENOMINATOR = 1;
c = -1;
}
//END of "The DENOMINATOR or DENOMINATOR has 32 bits"
//******************************************************
//should read a new character from the file
if( ct8 == 0 )
{
ch_read=fgetc(pf);
nCHAR_read ++;
write_progress();
for(int i=7;i>=0;i--)
if(((ch_read&(1<> i) == 1)
bin_asc[i] = '1';
else if(((ch_read&(1<> i) == 0)
bin_asc[i] = '0';
bin_asc[8] = '\0';
}
if(bin_asc[ct8] == '0')
{
DENOMINATOR = NOMINATOR + DENOMINATOR;
ct8 ++;
ct8 = ct8 % 8;
strcat(FRAGMENT_CHAR,"0");
goto Again;
}
else if(bin_asc[ct8] == '1')
{
NOMINATOR = NOMINATOR + DENOMINATOR;
ct8 ++;
ct8 = ct8 % 8;
strcat(FRAGMENT_CHAR,"1");
goto Again;
}
}// if( !feof(pf) )
}
void main(void)
{
printf(" CWT Compress: using the Calkin-Wilf Tree\n");
printf("============ Author: BOCUT Adrian Sebastian, 13.07.2010, Ver.1.0 ============\n");
printf("\n");
printf("Start...");
FRAGMENT_CHAR[0] = '\0';
pf = fopen("in.txt","rb");
rez_aux = fopen("rez_aux.txt","wb");
rez = fopen("rez.txt","wb");
fseek(pf, 0L, SEEK_END);
in_size = ftell(pf);
fseek(pf, 0L, SEEK_SET);
ct8 = 0;
/* The root is treated separatly */
NOMINATOR= 1;
DENOMINATOR = 1;
/* Generation of the CALKIN-WILF tree fractions */
//This is the level of the current fraction in the tree
c= -1;
CW_tree();
if(FRAGMENT_CHAR[0]!='\0')
{
Write_In_File("0");
tc_length = strlen(FRAGMENT_CHAR);
for(int i=31;i>=0;i--)
if(((tc_length&(1<>i) == 1)
sir_length[i] = '1';
else if(((tc_length&(1<>i) == 0)
sir_length[i] = '0';
sir_length[32] = '\0';
GAIN = GAIN - 33;
Write_In_File(sir_length);
Write_Original_Bits();
//remaining bits
rb = write_contor;
for(i=0;i<8-rb;i++)
Write_In_File("0");
fprintf(rez,"%c%c",rb,rb);
number_ch_written ++;
number_ch_written ++;
nCHAR_read = in_size;
ch_afis = 1000-1;
write_progress();
FRAGMENT_CHAR[0] = '\0';
}
fprintf(rez_aux,"#0 GAIN = %10ld",GAIN);
fprintf(rez_aux," TOTAL = %10lu\n",8*in_size);
fprintf(rez_aux,"\n\nNumber of read chars = %lu\n",nCHAR_read);
fprintf(rez_aux,"\n\nNumber of written chars = %lu\n",number_ch_written);
printf("\n\nNumber of saved bits = %ld\n",GAIN);
printf("\n\nNumber of read chars = %lu\n",nCHAR_read);
printf("\n\nNumber of written chars = %lu\n",number_ch_written);
printf("\b\nEND...\n\a");
fflush(rez);
fclose(rez);
fclose(rez_aux);
fclose(pf);
getche();
}
BETA Source code for the Proposed DE-COMPRESSION Algorithm
# include "stdio.h"
# include "conio.h"
# include "string.h"
# define MAXVALUE 0x7FFFFFFF
typedef unsigned long int UL_int;
FILE * file_in, * file_out;
UL_int ch_write_progress = 0;
UL_int nCHAR_read = 0;
UL_int file_in_size;
UL_int NOMINATOR = 0, DENOMINATOR = 0;
UL_int x,y;
int write_char = 0;
int new_char = 0;
int aux_new_char = 0;
char ch_read = 0;
char bit = 0;
char contor_read_bits = 0;
char array_fraction[1000000];
char write_contor = 0;
int first = 0;
/**********************************************************/
void Write_In_File(char * array)
{
UL_int length = 0;
int i,j;
length = strlen(array);
for(i=0;i
{
write_char = write_char * 2;
if(array[i] == '1')
write_char = write_char + 1;
write_contor ++;
if((write_contor % 8) == 0)
{
//write_char = write_char >> 1;
new_char = 0;
for(j=0;j<8;j++)
{
new_char = new_char * 2;
if((write_char % 2) == 1)
new_char = new_char + 1;
//new_char <<= 1;
write_char /= 2;
}
//new_char >>= 1;
fputc(new_char,file_out);
fflush(file_out);
aux_new_char = new_char;
new_char = 0;
write_contor = 0;
write_char = 0;
}
}
}
void read_from_file();
void write_progress()
{
ch_write_progress ++;
if( (ch_write_progress % 2 ) == 0 )
{
printf("\b");
switch( (ch_write_progress / 1000) % 4)
{
case 0: printf("-"); break;
case 1: printf("\\"); break;
case 2: printf("|"); break;
case 3: printf("/"); break;
}
printf("%7.3f\%%",(((double)nCHAR_read/file_in_size)*100));
printf("\b\b\b\b\b\b\b\b");
}
}
char read_one_bit_from_file()
{
char b = 0;
//Test if a new character should be read from the file
if(contor_read_bits == 0)
{
if(!feof(file_in))
ch_read = fgetc(file_in);
nCHAR_read ++;
write_progress();
}
//Provide the current bit
b = (ch_read & (1<<(7-contor_read_bits))) >> (7-contor_read_bits);
contor_read_bits ++;
contor_read_bits = contor_read_bits % 8;
return b;
}
void write_byte_sequence()
{
UL_int length;
UL_int k;
char i,b;
length = 0;
//Read 32 bits for length
for(i=31;i>=0;i--)
{
b = read_one_bit_from_file();
if(b == 1)
length = length | (1<
}
//Read from the file, "length" bytes
for(k=0;k
{
b = read_one_bit_from_file();
if(b == 1)
Write_In_File("1");
else
Write_In_File("0");
}
read_from_file();
}
void write_byte_sequence_from_fraction()
{
UL_int valid_length;
long double magnitude = 0.0;
char i,b;
NOMINATOR = 0;
DENOMINATOR = 0;
//Read 32 bits for NOMINATOR
for(i=31;i>=0;i--)
{
b = read_one_bit_from_file();
if(b == 1)
NOMINATOR = NOMINATOR | (1<
}
//Read 32 bits for DENOMINATOR
for(i=31;i>=0;i--)
{
b = read_one_bit_from_file();
if(b == 1)
DENOMINATOR = DENOMINATOR | (1<
}
x = NOMINATOR;
y = DENOMINATOR;
array_fraction[0] = '\0';
do
{
magnitude = (long double)x / (long double)y;
if( magnitude > 1.0 )
{
x = x - y;
strcat(array_fraction,"1");
}
else if( magnitude < 1.0 )
{
y = y - x;
strcat(array_fraction,"0");
}
}
while(x!=y);
valid_length = strlen(array_fraction);
array_fraction[valid_length] = '\0';
strrev(array_fraction);
valid_length = strlen(array_fraction);
Write_In_File(array_fraction);
array_fraction[0] = '\0';
read_from_file();
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
void read_from_file()
{
if(!feof(file_in))
{
bit = read_one_bit_from_file();
if(bit == 0)
{
write_byte_sequence();
}
else
{
write_byte_sequence_from_fraction();
}
}
}
/**********************************************************/
void main(void)
{
file_in = fopen("rez.txt","rb");
file_out = fopen("decompressed.txt","wb");
fseek(file_in, 0L, SEEK_END);
file_in_size = ftell(file_in);
fseek(file_in, 0L, SEEK_SET);
printf(" CWT DEcompress: using the Calkin-Wilf Tree\n");
printf("============ Author: BOCUT Adrian Sebastian, 13.07.2010, Ver.1.0 ============\n");
printf("\n");
printf("Start...");
read_from_file();
fflush(file_out);
fclose(file_in);
fclose(file_out);
printf("END...");
getche();
}
