package Exactocr;

########################################################################
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; either version 2 of the License, or    #
# (at your option) any later version.                                  #
#                                                                      #
# This program is distributed in the hope that it will be useful,      #
# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         #
# GNU General Public License for more details at http://www.gnu.org    #
########################################################################

#============================================================================
# This module searches for groups of pixels that are connected together and thus should
# represent one character. The input text array must contain a set of strings, each of them
# representing one pixel column of the picture with text.
#============================================================================

use Exporter;
@ISA = qw(Exporter);
# Exported by default: the two public subs, the pattern database and the key of the last lookup.
@EXPORT = qw(&find_char &check_ocr_database %ocr_database $ocr_key);
$VERSION = 0.1;

%ocr_database = ();
#def %ocr_database - hash containing recognized patterns and their association - char(s) assigned by user;
#    as read by check_ocr_database: key = number produced by generate_ocr_key, value = reference to a
#    list of entries, each entry being a reference to an array ( assigned char(s), pixel column, pixel column, ... )
$ocr_key = 0;
#def $ocr_key - number generated out of the pixels of a pattern, used as the key in the hash %ocr_database;
#    overwritten by every call of check_ocr_database

#=====================================================================================================================
#input:  (in this order) starting column; text height in pixels; space width in pixels;
#	 diacritics distance in pixels; max overlapping of chars in pixels; chars splitting (1|0);
#	 empty pixel txt representation; filled pixel txt representation;
#	 text array containing the PGM subtitle converted to text (one string per pixel column)
#output: list of: column to continue from; space detected 1/0; error flag 1/0;
#	 text array containing the pixel columns of the recognized char/group of chars
#note:	 all arguments are stored in package variables by means of 'local', so the helper subs
#	 called from here (modify_pix_col, shift_pix_cols_left) read $pixel, $empty and $diacritics
#	 through them; the values are restored when find_char returns.
#	 $height and $split are accepted but not referenced inside this sub.
#	 The sub also (re)sets the package variables @char_pix_columns, $last_char_pixel and
#	 $origin_ocr_column; $origin_ocr_column keeps its value between calls.
sub find_char {
    local ( $ocr_column, $height, $space, $diacritics, $overlap, $split, $empty, $pixel, @pgm_txt_sub ) = @_;

    @char_pix_columns = ();
    # remember where this call started - used by the infinite-loop guard at the end
    $prev_ocr_column = $ocr_column;
    $empty_columns = 0;
    $space_detected = 0;
    $error_log = 0;

    # the starting column contains no filled pixel
    if ( index($pgm_txt_sub[$ocr_column],$pixel) < 0 ) {

#	++++++++++++++++  empty pix column - detect space  ++++++++++++++++++++
	# skip (and count) empty columns until a filled pixel or the end of the array is found
	while ( index($pgm_txt_sub[$ocr_column],$pixel) < 0 and $ocr_column < @pgm_txt_sub ) {

	    $empty_columns ++;
	    $ocr_column ++;

	}

	# a run of at least $space empty columns is reported as a space
	if ( $empty_columns >= $space ) { $space_detected = 1; }

    } 

    # a column with a filled pixel was found inside the array
    if ( index($pgm_txt_sub[$ocr_column],$pixel) >= 0 and $ocr_column < @pgm_txt_sub ) {

	if ( $ocr_column > 0 ) {

#	    ++++++++++++++  return back and look for the very beginning of the character  +++++++++++++++++
	    $right_pix_col = $pgm_txt_sub[$ocr_column];
	    # left neighbour reduced to the pixels that are connected to the current column
	    $left_pix_col = &modify_pix_col($pgm_txt_sub[$ocr_column-1],$right_pix_col);

	    $move_back = 1;

	    # walk to the left while the connected part of the left neighbour still has pixels;
	    # never steps onto column 0
	    while ( (index($left_pix_col,$pixel) >= 0) and ($ocr_column-1 > 0 ) ) {

		last if ( $move_back > 4*$overlap ); # value 4 estimated

		$ocr_column--;
		$move_back++;
		$right_pix_col = $left_pix_col;
		# third argument 1 = "back" mode of modify_pix_col
		$left_pix_col = &modify_pix_col($pgm_txt_sub[$ocr_column-1],$right_pix_col,1);

	    }

	    # walked back further than the allowed overlap - move the column forward again
	    # NOTE(review): after k steps back $move_back is k+1, so this lands one column to the right
	    # of the column where the walk started - confirm this is intended
	    if ( $move_back > $overlap+1 ) { $ocr_column += $move_back; }
	    
	} else {

	    # already at the first column - nothing to the left to inspect
	    $right_pix_col = $pgm_txt_sub[$ocr_column];

	}

	$left_pix_col = $right_pix_col;
	$right_pix_col = &modify_pix_col($pgm_txt_sub[$ocr_column+1],$left_pix_col);

#	+++++++++++++++  moves forward and look for the end of the character  +++++++++++++++++
	
	# collect columns while the connected part of the next column still has pixels
	while ( (index($right_pix_col,$pixel) >= 0) and ($ocr_column+2 < @pgm_txt_sub) ) {

	    push(@char_pix_columns,$left_pix_col);
	    $ocr_column++;
	    $left_pix_col = $right_pix_col;
	    $right_pix_col = &modify_pix_col($pgm_txt_sub[$ocr_column+1],$left_pix_col);

	}

	# the last column that was still connected
	push(@char_pix_columns,$left_pix_col);

#	+++++++++++++++  check end of the subtitle text row  ++++++++++++++++
	if ( $ocr_column+2 >= @pgm_txt_sub ) {

	    if ( index($right_pix_col,$pixel) >= 0 ) {
		push(@char_pix_columns,$right_pix_col);
	    }
	    $ocr_column++;

	}

	$char_pix_width = @char_pix_columns;

#	+++++++++++++++  handle diacritics  +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# only for patterns wider than both overlap borders together
	if ( $char_pix_width > 2*$overlap ) {

#	    +++++++  keep the middle part of the character unchanged incl. diacritics +++++++++

	    # middle columns are taken unfiltered from the input array
	    for ( $i = $overlap; $i < ($char_pix_width - $overlap); $i++ ) {
		$char_pix_columns[$i] = $pgm_txt_sub[$ocr_column-$char_pix_width+$i+1];
	    }

#	    +++++++++++++++  handle again borders of the character  +++++++++++++

#	    +++++++++++++++  left border to the left  +++++++++++++++++++++++++++

	    # re-filter the left border columns, each against its right neighbour
	    for ( $i = $overlap-1; $i >= 0; $i-- ) {
		$char_pix_columns[$i] = &modify_pix_col($pgm_txt_sub[$ocr_column-$char_pix_width+$i+1],$char_pix_columns[$i+1]);
	    }

#	    +++++++++++++++  right border forward  +++++++++++++++++++++

	    for ( $i = 0; $i < $overlap; $i++ ) {
		pop(@char_pix_columns);  # deletes right border of char
	    }

	    $char_pix_width = @char_pix_columns;

	    # step back and rebuild the right border starting from the last kept column
	    $ocr_column -= $overlap-1;
	    $left_pix_col = &modify_pix_col($pgm_txt_sub[$ocr_column],$char_pix_columns[$char_pix_width-1]);
	    $right_pix_col = &modify_pix_col($pgm_txt_sub[$ocr_column+1],$left_pix_col);

#	    +++++++++++++++  moves forward and look for the end of the character  +++++++++++++++++

	    while ( index($right_pix_col,$pixel) >= 0 and $ocr_column+2 < @pgm_txt_sub ) {

		push(@char_pix_columns,$left_pix_col);
		$ocr_column++;
		$left_pix_col = $right_pix_col;
		$right_pix_col = &modify_pix_col($pgm_txt_sub[$ocr_column+1],$left_pix_col);

	    }

	    push(@char_pix_columns,$left_pix_col);

#	    +++++++++++++++  check end of the subtitle text row  ++++++++++++++++

	    if ( $ocr_column+2 >= @pgm_txt_sub ) {

		if ( index($right_pix_col,$pixel) >= 0 ) {

		    push(@char_pix_columns,$right_pix_col);

		}
		$ocr_column++;

	    }

	    $char_pix_width = @char_pix_columns;

	}

    }

    $char_pix_width = @char_pix_columns;
    
    # patterns of a single column (or none) are returned as they are
    if ( $char_pix_width > 1 ) {
    
	$last_char_pixel = 0;
	@char_pix_columns = &shift_pix_cols_left(1,@char_pix_columns);
	# parm 1 tells the function to also update $last_char_pixel


	# TRY TO SPLIT THE PATTERN HERE?

    
    }

    # continue with the column behind the pattern
    $ocr_column++;

    if ( $ocr_column <= $prev_ocr_column ) {
    # this is against the infinite loop due to specific connection of neighbouring chars

	# no progress was made: force the position one column behind the last accepted position
	$origin_ocr_column += 1;
	$ocr_column = $origin_ocr_column;
	$error_log = 1;
	# the message is Czech for "ERROR - moving the OCR column back"
	print"ERROR - vracim zpet OCR sloupec\n";

    } else {

	$origin_ocr_column = $ocr_column;

    }

    return ($ocr_column,$space_detected,$error_log,@char_pix_columns);

}
#----------------------------------------------------------------------------------------------------


#====================================================================================================
# Stub without any implementation: ignores its arguments and returns nothing
# (empty list in list context, undef in scalar context).
# Presumably reserved for splitting a pattern made of several joined characters -
# TODO confirm the intended contract before implementing.
sub split_chars {
    return;
}
#----------------------------------------------------------------------------------------------------


#====================================================================================================
# The core idea of the exact-match OCR algorithm - keep only those pixel runs of a column that are
#	directly connected to pixels of the neighbouring (main) column.
#
# o   o   -if the first column is the main, then only 'o' pixels remain, 'x' pixels are lost
# ooooo
# o o 
# o o x
# o  xx
#
#input:  pixel column to be filtered; neighbouring (main) pixel column; back flag (true when the
#	 caller walks to the left - disables the "accent above" rescue below)
#output: copy of the filtered column in which every run of filled pixels that is not kept is
#	 overwritten by the empty pixel; undef when the column to be filtered is undef
#uses:   package variables $pixel, $empty, $diacritics, $max_shift_down and @char_pix_columns
#
# A run of filled pixels is kept when
#   1) it shares at least one row with a filled pixel of the main column, or
#   2) it starts at or above $max_shift_down+$diacritics and a filled pixel lies at most
#      $diacritics+1 rows below it in the same column (diacritics sitting above a body), or
#   3) not in back mode, the main column has at most one filled pixel, the run ends in the upper
#      half and a filled pixel lies at most $diacritics+1 rows above it in the same column.
sub modify_pix_col {
    my ( $change_col, $main_col, $back ) = @_;

    # nothing to filter, e.g. the caller asked for a column behind the end of the picture
    return $change_col unless defined $change_col;
    $main_col = '' unless defined $main_col;

    my $col_len     = length($change_col);
    my $main_len    = length($main_col);
    my $dia_reach   = $diacritics     || 0;
    my $base_shift  = $max_shift_down || 0;
    my $half_height = defined $char_pix_columns[0] ? length($char_pix_columns[0]) / 2 : 0;
    my $forward     = ( ( $back || 0 ) == 0 );
    # true when the main column holds no more than one filled pixel
    my $main_is_thin = ( index($main_col, $pixel) == rindex($main_col, $pixel) );

    my $index = 0;

    while ( $index < $col_len ) {

        # traces pixel column from the bottom to the top - because of the skewness of italics

        if ( substr($change_col, $index, 1) eq $pixel ) {

            # find the extent of this run of filled pixels
            my $first_set = $index;
            while ( $index < $col_len and substr($change_col, $index, 1) eq $pixel ) {
                $index++;
            }
            my $last_set = $index - 1;

            # 1) direct connection to the main column
            my $keep = 0;
            for ( my $i = $first_set; $i <= $last_set; $i++ ) {
                if ( $i < $main_len and substr($main_col, $i, 1) eq $pixel ) {
                    $keep = 1;
                    last;
                }
            }

            # 2) to keep diacritics
            #    can't catch diacritics over u,y,U,Y,N
            #    also preserves the stroke of an exclamation mark in italics, where the dot
            #    begins earlier
            if ( !$keep and $first_set >= ( $base_shift + $dia_reach ) ) {

                for ( my $i = $first_set - 1 - $dia_reach; $i < $first_set; $i++ ) {
                    next if $i < 0;
                    if ( substr($change_col, $i, 1) eq $pixel ) {
                        $keep = 1;
                        last;
                    }
                }

            }

            # 3) exceptionally, with a long i (i with acute accent), the accent begins before
            #    the i itself and the part below the accent would be lost.
            #    To avoid cycling with italics an additional test is required: the previous
            #    pixel column contains only one active pixel.
            if ( !$keep and $forward and $main_is_thin and $last_set > $half_height ) {

                for ( my $i = $last_set + 1 + $dia_reach; $i > $last_set; $i-- ) {
                    # The scan runs downwards from its highest row, so rows above the top of
                    # the column must be skipped; leaving the loop here would cancel the whole
                    # scan whenever the window crosses the top of the column.
                    next if $i >= $col_len;
                    if ( substr($change_col, $i, 1) eq $pixel ) {
                        $keep = 1;
                        last;
                    }
                }

            }

            # erase a run that is not kept
            unless ( $keep ) {
                for ( my $i = $first_set; $i <= $last_set; $i++ ) {
                    substr($change_col, $i, 1, $empty);
                }
            }

        }

        # the row behind a run is known to be empty, so it can be stepped over as well
        $index++;

    }

    return $change_col;

}
#----------------------------------------------------------------------------------------------------


#====================================================================================================
# Shifts the char pixels down to the base: removes the common run of empty rows at the beginning
# of all columns and cuts every column behind its last filled pixel.
#
#input:  flag - when true the package variable $last_char_pixel is raised to the highest filled
#	 row (after shifting) found in the set; list of pixel column strings
#output: list of the shifted and trimmed columns, same number of elements as the input;
#	 a column without any filled pixel becomes an empty string
#uses:   package variable $pixel; may raise package variables $max_shift_down and $last_char_pixel
sub shift_pix_cols_left {
    my ( $for_set_print, @set ) = @_;
    my $shift_char_down;    # lowest filled row over all columns, undef until a pixel is seen

    foreach my $column ( @set ) {

        next unless defined $column;
        my $first_left = index($column, $pixel);
        next if $first_left < 0;

        if ( !defined $shift_char_down or $first_left < $shift_char_down ) {
            $shift_char_down = $first_left;
        }

        last if $shift_char_down == 0;

    }

    # no filled pixel in the whole set - there is nothing to align or to measure
    return map { '' } @set unless defined $shift_char_down;

    my $col_height = defined $set[0] ? length($set[0]) : 0;

    # into $max_shift_down can't be included shift of the - and higher placed patterns; 0.25 based on tests
    if ( $shift_char_down <= $col_height * 0.25 and $shift_char_down > ( $max_shift_down || 0 ) ) {
        $max_shift_down = $shift_char_down;
    }

    # in order to not misrecognise apostrophe and comma, the patterns in the upper half
    # keep one empty row at the bottom
    if ( $shift_char_down > $col_height * 0.6 ) {
        $shift_char_down--;
    }

    for my $i ( 0 .. $#set ) {

        my $last_col_pixel = defined $set[$i] ? rindex($set[$i], $pixel) : -1;

        # A column without pixels has nothing to keep. Without this test the length passed to
        # substr would be negative and a slice of empty pixels would be returned instead.
        if ( $last_col_pixel < 0 ) {
            $set[$i] = '';
            next;
        }

        my $top_row = $last_col_pixel - $shift_char_down;
        $set[$i] = substr($set[$i], $shift_char_down, $top_row + 1);

        if ( $for_set_print and $top_row > ( $last_char_pixel || 0 ) ) {
            $last_char_pixel = $top_row;
        }

    }

    return @set;

}
#----------------------------------------------------------------------------------------------------


#==========================================================================================================
#input:  list of pixel columns (strings) describing the pattern that was found
#output: char(s) stored in %ocr_database for an entry whose pixel columns are all equal to the
#        given ones (exact match); empty string when no such entry exists
#note:   stores the key computed for the pattern into the exported $ocr_key on every call
sub check_ocr_database {
    my @columns = @_;
    my $width   = scalar @columns;

    # the key is published in the package variable before any lookup is done
    $ocr_key = generate_ocr_key(@columns);

    return "" unless exists( $ocr_database{$ocr_key} );

    ENTRY:
    foreach my $entry ( @{ $ocr_database{$ocr_key} } ) {

        # layout of an entry: element 0 = assigned char(s), the rest = pixel columns
        my @stored = @$entry;

        # the number of stored pixel columns must be the same as the pattern width
        next ENTRY unless $width == @stored - 1;

        foreach my $n ( 0 .. $width - 1 ) {
            next ENTRY if $stored[ $n + 1 ] ne $columns[$n];
        }

        # every column is equal - the first matching entry wins
        return $stored[0];

    }

    return "";

}
#----------------------------------------------------------------------------------------------------------


#==========================================================================================================
# Computes the number used as the key into %ocr_database.
#
#input:  list of pixel columns (strings)
#output: sum over all columns of ( number of filled pixels in the column * 2**position ),
#        where position is the 0-based index of the column in the list
#uses:   package variable $pixel as the filled pixel representation. find_char sets $pixel by
#        'local' for the time of its own call only, so a caller from outside has to make sure
#        $pixel holds a value; while it is undefined or empty no pixel can be counted and the
#        key is 0.
sub generate_ocr_key {
    my ( @set ) = @_;
    my $key      = 0;
    my $multiple = 1;
    my $countable = ( defined $pixel and length $pixel ) ? 1 : 0;

    foreach my $pattern ( @set ) {

        my $number = 0;

        if ( $countable and defined $pattern ) {
            # $pixel is matched literally (\Q..\E) and is read again on every call, so a regex
            # metacharacter or a changed pixel representation can't distort the count;
            # assigning the match list to () in scalar context yields the number of matches
            $number = () = $pattern =~ /\Q$pixel\E/g;
        }

        $key += $multiple * $number;
        $multiple *= 2;

    }

    return $key;

}
#----------------------------------------------------------------------------------------------------------





1;

