Friday, July 23, 2010

Calkin Wilf Compress

Starting ideea


The start idea comes from the Stern-Brocot tree which contains fractions.
I used this tree and put on it's leafs, binary labels.
With this new structure I observed that on the exterior of the tree I obtain fractions with small values for nominator and denominator which has a longer binary way formed by labels (for example the fraction 1/100 has 100 of 0).
I propose to use these codes and use instead of them, the obtained Stern-Brocot fraction.


The Motivation of the proposed Algorithm


The proposed algorithm uses the Calkin-Wilf tree instead of Stern-Brocot tree due to the following reasons:
- in the decompression part the fractions can be produced faster by applying in an inverse way the construction algorithm: taking into consideration a fraction a/b, if it's value is bigger than 1 then the fraction from the root node contains the value (a-b)/b and otherwise, the root fraction will be a/(b-a).


The steps of the proposed algorithm


- reading sequentially the binary representations of the ASCII codes of the input chars
- the construction with these bits of the road in the Calkin-Wilf tree
- in case that is obtained a fraction with nominator or denominator which can be represent on 32 bits, than:
- if the length of the way in the tree is longer than 64 than in the compressed file is written the fraction
- otherwise the original bits are copied in the compressed file


Examples of the results obtained on some classic files



    

Lena original            Lena negative        Lena black-white    Lena grayscale          Lena odd lines 


                                                
File name
Original Size (Bytes)

Number of Gained BITS
Lena original (gray)
786.486
- 22.416
Lena negativ
786.486
- 22.472
Lena 512 colors
786.486
- 1.306
Lena black-white
32.830
+ 74.369
Lena grayscale
263.222
- 12.765
Lena – odd lines
496.694
+ 2.124.584 / 8 = 265.573 Bytes


Examples of the algorithm on some classic files




File name
Original size (Bytes)

Number of GAINED BITS
RAR archive size
Notepad.exe
Version 5.1 (Build 2600.xpsp_sp2_rtm.040803-2158: SP2)
69.120
73.371 / 8 = 9171 bytes
33.530 Bytes
Regedit.exe
Version 5.1 (Build 2600.xpsp_sp2_rtm.040803-2158: SP2)
146.432
95.412 / 8 = 11.926 bytes
53.424 Bytes
TASKMAN.exe
15.360
15.696 / 8 = 1962 bytes
7.169 Bytes
Winhelp.exe
256.192
-13.358
125.426 Bytes
Twain.dll
Version 1.7.0.0
94.784
+6.120
36.941 Bytes
Firefox.exe
Version 1.9.2.3828, Product version 3.6.6
910.296
2.065.515 / 8 = 258.189 bytes
153.137 Bytes

Remarks

Running for many times the algorithm, I observed the following:
- the proposed algo do not realize all the time a compression of a file
- from the file that can be compressed better, there are: black-white pictures, exe file or files that contains many NUL chars or many chars with ASCII code 0xFF.

It is interesting to study if such tree can be generating for obtaining as much as many reductible fractions for using this proposed algo and combining it with others compression algorithms.

The next version of code will contain instead of copying the original bits, an classic compression method.


BETA Source code for the Proposed COMPRESSION Algorithm



# include "conio.h"
# include "stdio.h"
# include "string.h"

# define  MAXVALUE 0x7FFFFFFF

typedef unsigned long int UL_int;

FILE * pf,* rez_aux, * rez;

UL_int NOMINATOR, DENOMINATOR;
UL_int ch_afis = 0;
UL_int in_size;
UL_int nCHAR_read = 0;
UL_int tc_length = 0;
UL_int number_ch_written = 0;

long c = -1;
long GAIN = 0;

int ct8 = 0;
int i = 0;
int rb;
int tcc = 1;
int tpc = 1;
int write_char = 0;
int ch_read;//The ASCII code of the character

char bin_asc[9];
char sir_length[33];
char FRAGMENT_CHAR[(MAXVALUE>>1)];
char write_contor = 0;

/*----------------------------------------------------*/

void Write_In_File(char * sir)
{
UL_int k_var;
UL_int length = 0;
length = strlen(sir);
for(k_var=0;k_var
{
write_char = write_char * 2;
if(sir[k_var] == '1')
{
write_char = write_char + 1;
}
write_contor++;
if((write_contor>0)&&((write_contor % 8)==0))
{
fputc(write_char,rez);
write_contor = 0;
write_char = 0;
number_ch_written ++;
}
    
}
}

void write_progress()
{
ch_afis ++;
if( (ch_afis % 1000 ) == 0 )
{
printf("\b");
switch( (ch_afis / 1000) % 4)
{
case 0: printf("-"); break;
case 1: printf("\\"); break;
case 2: printf("|"); break;
case 3: printf("/"); break;
}
printf("%7.3f\%%",(((double)nCHAR_read/in_size)*100));
printf("\b\b\b\b\b\b\b\b");
}
}

void Write_Original_Bits()
{
fprintf(rez_aux,"#0 LENGTH = %28lu", tc_length);
fprintf(rez_aux," GAIN = %10ld",GAIN);
fprintf(rez_aux," TOTAL = %10lu\n",8*in_size);
Write_In_File(FRAGMENT_CHAR);
}


void Write_Fractions(UL_int NOM, UL_int DENOM)
{
int k;
//Write the "bookmark"
Write_In_File("1");
for(k=31;k>=0;k--)
if( ((NOM & (1<> k) == 1 )
Write_In_File("1");
else if( ((NOM & (1<> k) == 0 )
Write_In_File("0");
for(k=31;k>=0;k--)
if( ((DENOM & (1<>k) == 1  )
Write_In_File("1");
else if( ((DENOM & (1<>k) == 0  )
Write_In_File("0");
}

/*----------------------------------------------------------------------*/
/* Function for the generating the "Calkin-Wilf" tree of "c" level */
/*----------------------------------------------------------------------*/
void CW_tree()
{
Again:;
 
 c = c+1;
 
 if( !feof(pf) )
 {
 
 //The NOMINATOR or DENOMINATOR has 32 bits
 if( (NOMINATOR>MAXVALUE )||( DENOMINATOR>MAXVALUE) )
 {  
 //The fractions should be written in the file
 if( c > 64 ) 
 {
 tcc = 1;  
 
 if((tcc!=tpc))
 {
 //Write the "bookmark"
 Write_In_File("0");
 
 tc_length = strlen(FRAGMENT_CHAR) - c;
 
 sir_length[0] = '\0';
 
 for(int i=31;i>=0;i--)
 if(((tc_length&(1<> i) == 1)
 sir_length[31-i] = '1';
 else if(((tc_length&(1<> i) == 0)
 sir_length[31-i] = '0';
 
 sir_length[32] = '\0';  
 
 GAIN = GAIN - 33;
 
 Write_In_File(sir_length);
 sir_length[0] = '\0';
 
 //Modify here the FRAGMENT_CHAR: shorter with "c" position
 FRAGMENT_CHAR[tc_length] = '\0';
 
 Write_Original_Bits();
 
 }
 
 FRAGMENT_CHAR[0] = '\0';
 
 Write_Fractions(NOMINATOR,DENOMINATOR);
 
 GAIN = GAIN + c - 65;
 
 fprintf(rez_aux,"#1 %10lu/%10lu",NOMINATOR,DENOMINATOR);
 fprintf(rez_aux," H_Tree = %5ld ",c);
 fprintf(rez_aux," GAIN = %10ld",GAIN);
 fprintf(rez_aux," TOTAL = %10lu\n",8*in_size);
 
 }
 //The ORIGINAL bits should be copied in the file
 else 
 {
 tcc = 0;
 }
 
 tpc = tcc;
 
 NOMINATOR     =  1;
 DENOMINATOR   =  1;
 c             = -1;
 
 
 }
 //END of "The DENOMINATOR or DENOMINATOR has 32 bits"
 
 
 //******************************************************
 //should read a new character from the file
 if( ct8 == 0 ) 
 {
 ch_read=fgetc(pf);
 
 nCHAR_read ++;
 write_progress();
 
 for(int i=7;i>=0;i--)
 if(((ch_read&(1<> i) == 1)
 bin_asc[i] = '1';
 else if(((ch_read&(1<> i) == 0)
 bin_asc[i] = '0';
 
 bin_asc[8] = '\0';  
 }
 
 if(bin_asc[ct8] == '0')
 {
 DENOMINATOR   = NOMINATOR + DENOMINATOR;
 
              ct8 ++;
              ct8 = ct8 % 8;
 
              strcat(FRAGMENT_CHAR,"0");
 
 goto Again;
 }
 else if(bin_asc[ct8] == '1')
 {
 NOMINATOR = NOMINATOR + DENOMINATOR;
 
              ct8 ++;
              ct8 = ct8 % 8;
 
 strcat(FRAGMENT_CHAR,"1");
 
 goto Again;
 }
 
        }// if( !feof(pf) )
}

void main(void)
{
printf("       CWT Compress: using the Calkin-Wilf Tree\n");
printf("============ Author: BOCUT Adrian Sebastian, 13.07.2010, Ver.1.0 ============\n");
printf("\n");
printf("Start...");
FRAGMENT_CHAR[0] = '\0';
pf = fopen("in.txt","rb");
rez_aux = fopen("rez_aux.txt","wb");
rez = fopen("rez.txt","wb");
fseek(pf, 0L, SEEK_END);
in_size = ftell(pf);
fseek(pf, 0L, SEEK_SET);
ct8 = 0;
/* The root is treated separatly */
NOMINATOR= 1;
DENOMINATOR = 1;
/* Generation of the CALKIN-WILF tree fractions */
//This is the level of the current fraction in the tree
c= -1;
CW_tree();
if(FRAGMENT_CHAR[0]!='\0')
{
Write_In_File("0");
tc_length = strlen(FRAGMENT_CHAR);
for(int i=31;i>=0;i--)
if(((tc_length&(1<>i) == 1)
sir_length[i] = '1';
else if(((tc_length&(1<>i) == 0)
sir_length[i] = '0';
sir_length[32] = '\0';  
GAIN = GAIN - 33;
Write_In_File(sir_length);
Write_Original_Bits();
//remaining bits
rb = write_contor;
for(i=0;i<8-rb;i++)
Write_In_File("0");
fprintf(rez,"%c%c",rb,rb);
number_ch_written ++;
number_ch_written ++;
nCHAR_read = in_size;
ch_afis = 1000-1;
write_progress();
FRAGMENT_CHAR[0] = '\0';
}
fprintf(rez_aux,"#0 GAIN = %10ld",GAIN);
fprintf(rez_aux," TOTAL = %10lu\n",8*in_size);
    fprintf(rez_aux,"\n\nNumber of read chars = %lu\n",nCHAR_read);
    fprintf(rez_aux,"\n\nNumber of written chars = %lu\n",number_ch_written);
printf("\n\nNumber of saved bits = %ld\n",GAIN);
    printf("\n\nNumber of read chars = %lu\n",nCHAR_read);
    printf("\n\nNumber of written chars = %lu\n",number_ch_written);
    printf("\b\nEND...\n\a");
fflush(rez);
fclose(rez);
fclose(rez_aux);
fclose(pf);
getche();
}

BETA Source code for the Proposed DE-COMPRESSION Algorithm



# include "stdio.h"
# include "conio.h"
# include "string.h"

# define  MAXVALUE 0x7FFFFFFF

typedef unsigned long int UL_int;

FILE * file_in, * file_out;

UL_int ch_write_progress = 0;
UL_int nCHAR_read = 0;
UL_int file_in_size;
UL_int NOMINATOR = 0, DENOMINATOR = 0;
UL_int x,y;
int write_char = 0;
int new_char = 0;
int aux_new_char = 0;

char ch_read = 0;
char bit = 0;
char contor_read_bits = 0;
char array_fraction[1000000];
char write_contor = 0;

int first = 0;


/**********************************************************/
void Write_In_File(char * array)
{
UL_int length = 0;
int i,j;

length = strlen(array);

for(i=0;i
{
write_char = write_char * 2;

if(array[i] == '1')
write_char = write_char + 1;

write_contor ++;

if((write_contor % 8) == 0)
{
//write_char = write_char >> 1;
new_char = 0;
for(j=0;j<8;j++)
{
new_char  = new_char * 2;

if((write_char % 2) == 1)
     new_char = new_char + 1;
//new_char  <<= 1;
write_char /= 2;
}
//new_char  >>= 1;

fputc(new_char,file_out);
fflush(file_out);

aux_new_char = new_char;

new_char = 0;
write_contor = 0;
write_char = 0;
}
}

}

void read_from_file();

void write_progress()
{
ch_write_progress ++;
if( (ch_write_progress % 2 ) == 0 )
{
printf("\b");
switch( (ch_write_progress / 1000) % 4)
{
case 0: printf("-"); break;
case 1: printf("\\"); break;
case 2: printf("|"); break;
case 3: printf("/"); break;
}
printf("%7.3f\%%",(((double)nCHAR_read/file_in_size)*100));
printf("\b\b\b\b\b\b\b\b");
}
}

char read_one_bit_from_file()
{
char b = 0;
//Test if a new character should be read from the file
if(contor_read_bits == 0)
{
if(!feof(file_in))
ch_read = fgetc(file_in);
nCHAR_read ++;
write_progress();
}
//Provide the current bit
b = (ch_read & (1<<(7-contor_read_bits))) >> (7-contor_read_bits); 
contor_read_bits ++;
contor_read_bits = contor_read_bits % 8;
return b;
}

void write_byte_sequence()
{
UL_int length;
UL_int k;
char i,b;
length = 0;
//Read 32 bits for length
for(i=31;i>=0;i--)
{
b = read_one_bit_from_file();
if(b == 1)
length = length | (1<
}
//Read from the file, "length" bytes
for(k=0;k
{
b = read_one_bit_from_file();
if(b == 1)
Write_In_File("1");
else
Write_In_File("0");
}

read_from_file();
}

void write_byte_sequence_from_fraction()
{
UL_int valid_length;
long double magnitude = 0.0;
char i,b;
NOMINATOR = 0;
DENOMINATOR = 0;
//Read 32 bits for NOMINATOR
for(i=31;i>=0;i--)
{
b = read_one_bit_from_file();
if(b == 1)
NOMINATOR = NOMINATOR | (1<
}
//Read 32 bits for DENOMINATOR
for(i=31;i>=0;i--)
{
b = read_one_bit_from_file();
if(b == 1)
DENOMINATOR = DENOMINATOR | (1<
}
x = NOMINATOR;
y = DENOMINATOR;
    array_fraction[0] = '\0';
    do
{
magnitude = (long double)x / (long double)y;
if( magnitude > 1.0 )
{
x = x - y;
strcat(array_fraction,"1");
}
else if( magnitude < 1.0 )
{
y = y - x;
strcat(array_fraction,"0");
}
while(x!=y);

valid_length = strlen(array_fraction);
array_fraction[valid_length] = '\0';
strrev(array_fraction);

valid_length = strlen(array_fraction);
Write_In_File(array_fraction);
array_fraction[0] = '\0';

read_from_file();
}

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

void read_from_file()
{
if(!feof(file_in))
{
bit = read_one_bit_from_file();
if(bit == 0)
{
write_byte_sequence();
}
else
{
write_byte_sequence_from_fraction();
}
}
}


/**********************************************************/

void main(void)
{
file_in  = fopen("rez.txt","rb");
file_out = fopen("decompressed.txt","wb");
fseek(file_in, 0L, SEEK_END);
file_in_size = ftell(file_in);
fseek(file_in, 0L, SEEK_SET);
printf("     CWT DEcompress: using the Calkin-Wilf Tree\n");
printf("============ Author: BOCUT Adrian Sebastian, 13.07.2010, Ver.1.0 ============\n");
printf("\n");
printf("Start...");
read_from_file();
fflush(file_out);
fclose(file_in);
fclose(file_out);
printf("END...");
getche();
}



No comments:

Post a Comment