package ReDIF::Parser::Input;

##  Copyright (c) 1997-2001 Ivan Kurmanov. All rights reserved.
##
##  This code is free software; you can redistribute it and/or modify it
##  under the same terms as Perl itself.

#$VERSION = do { my @r=(q$Revision$=~/\d+/g); sprintf "%d."."%02d"x$#r,@r }; 
#                       $Id$

local $/;

use strict;

use ReDIF::Unicode qw( &has_utf8_bom &utf8_from_latin1 &has_utf16_bom );
use Encode;

use vars qw( 
             $BUFFER
             $CURRENT_BUFFER_POSITION
             $CURRENT_FILE_POSITION
             $LINE_NUMBER
             $SOURCE_ENCODING 
             $PARSER_MODULE 
             $FILENAME 
             $Options 
             %TEMPLATE_STARTERS
             $TEMPLATE_CHECKSUM
             @LINES
             );

%TEMPLATE_STARTERS = (
                      'template-type' => 1,
                      );

use constant DEBUG => 0;

$BUFFER = ' ' x 3000;
my $CHUNK_SIZE = 400; 

my $LINE_ENDING ;

my $THE_LINE = ' 'x 2000;
my $THE_LINE_ORIGINAL = ' 'x 2000;
my $THE_LINE_FILE_POSITION;
my $THE_LINE_BUFFER_POSITION;
my $eol_length;
my $EOF_STATUS;
my $ATTRIBUTE;
my $PARSING_STRING =0;
# my $THE_LINE_LENGTH;  ### not used anymore!


###  MD5 checksum calculations for templates

use ReDIF::setup_conf;
if ( ReDIF::Parser::calculate_md5_checksum ) {
    require Digest::MD5;
    $TEMPLATE_CHECKSUM = Digest::MD5 -> new();
}

sub get_line_ending { return $LINE_ENDING ; }
sub get_the_line    { return $THE_LINE    ; }
# sub get_the_line_length { return $THE_LINE_LENGTH ; }


#######   sub  I N I T   ##################################################

sub init {
    my $opt = shift;

    if( defined $opt ) {
        $Options = $opt;

    } else {
        $Options = {};
    }
}

#######   sub  S T A R T   F I L E   ######################################

sub start_file {
    my $filename = shift;
    my $pos = shift;

    if ( (not -e $filename)
         or (not -r _ ) 
         or (not -f _ ) 
       ) {
      return undef;
    }
    $PARSING_STRING = 0;
    $FILENAME = $filename;

    ####  A check for non-standard line-termination
    if ( fileno FILE ) { close FILE; }
    if ( not open( FILE, '<', $filename ) ) {
      warn "open failed: $filename";
    } else { undef $!; }
    binmode FILE;
    if ( not defined read( FILE, $BUFFER, $CHUNK_SIZE ) ) {
      warn "read failed: $! ($filename)";
    } else { undef $!; }

    if ( $BUFFER =~ /\n/ )    { $LINE_ENDING = "\n"; }
    elsif ( $BUFFER =~ /\r/ ) { $LINE_ENDING = "\r"; }
    else { $LINE_ENDING = "\n"; }
        
    $CURRENT_BUFFER_POSITION = 0;
    $CURRENT_FILE_POSITION   = 0;
    
    if( $pos ) {
        $FILENAME .= " (starting at $pos)";
        undef $LINE_NUMBER;
    } else {
        $pos = 0;
    }
    if ( $! ) { warn "Sys: $!"; undef $!; }


    ###  UNICODE
    if ( has_utf8_bom $BUFFER ) {
        $SOURCE_ENCODING = "utf-8";
        $CURRENT_BUFFER_POSITION = 3;
        $pos = 3;
        binmode FILE, ":utf8";
        undef $!;
    } elsif ( has_utf16_bom $BUFFER ) {
        my $endianness = has_utf16_bom( $BUFFER );
        $SOURCE_ENCODING = "UTF-16$endianness";
        $CURRENT_BUFFER_POSITION = 2;
        $pos = 2;
        binmode FILE, ":encoding($SOURCE_ENCODING)";
        undef $!;
    } else {
        $SOURCE_ENCODING = "latin1";   
    }
    ####  

    $LINE_NUMBER = 0;
    seek (FILE, $pos, 0);
    $CURRENT_FILE_POSITION = $pos;
    $BUFFER = '' ;

    undef $THE_LINE;
    $EOF_STATUS = 0;
    
    return 1;
}


#######   sub  S T A R T   S T R I N G    ################################

sub start_string {
  my $string = shift;
  
  close FILE;
  
  $PARSING_STRING = 1;
  $FILENAME = "buffer string";
  $BUFFER = $string;
  
  ###  UNICODE
  
  if ( has_utf8_bom $BUFFER ) {
    $SOURCE_ENCODING = "utf-8";
    substr( $BUFFER, 0, 3 ) = '';
    $BUFFER = Encode::decode( 'utf-8', $BUFFER );
    
  } elsif ( has_utf16_bom $BUFFER ) {
    $SOURCE_ENCODING = "UTF-16" . has_utf16_bom( $BUFFER );
    substr( $BUFFER, 0, 2 ) = '';
    $BUFFER = Encode::decode( $SOURCE_ENCODING, $BUFFER );
    
  } elsif ( Encode::is_utf8( $BUFFER ) ) {
    $SOURCE_ENCODING = "utf-8";
    
  } else {
    $SOURCE_ENCODING = "latin1";   
  }
  ####  

  @LINES = split( /\n/, $BUFFER );
  $CURRENT_BUFFER_POSITION = 0;
  $LINE_NUMBER = 0;
  undef $THE_LINE;
  $EOF_STATUS = 0;
    
  return 1;
}


#######   sub  E X T R A C T   N E X T   L I N E    ########################

sub extract_next_line {

    if ( not $PARSING_STRING ) {
        
      $THE_LINE_FILE_POSITION = tell( FILE );
      local $/ = $LINE_ENDING;
      eval {
        $THE_LINE_ORIGINAL = $THE_LINE = <FILE>;
      };
      if ($@) {
        my $pos = tell( FILE );
        my $err = $@;
        $err =~ s! at [^\s]+ReDIF/Parser/Input.pm line \d+.*\n?!!g;
        warn "$err ($FILENAME, at $pos or further)\n";
        undef $THE_LINE;
        undef $THE_LINE_ORIGINAL;
      }
      if ( $! ) { undef $!; }
      
      if( not defined $THE_LINE ) {
        $EOF_STATUS = 1;
        return undef;
      }

        chomp $THE_LINE_ORIGINAL;
        $THE_LINE =~ s/\s+$//g;

    } else {

        undef $THE_LINE_FILE_POSITION;
        $THE_LINE_ORIGINAL = $THE_LINE = shift @LINES;

        if ( not defined $THE_LINE ) {
            $EOF_STATUS = 1;
            return undef;
        }

        $THE_LINE =~ s/\s+$//g;
        my $the_line_length       = length( $THE_LINE );
        $THE_LINE_FILE_POSITION   = $CURRENT_BUFFER_POSITION;
        $CURRENT_BUFFER_POSITION += $the_line_length + 1;
    }

    if ( defined $LINE_NUMBER ) { $LINE_NUMBER++; }
    
    ###   UNICODE
    
    if ( ( $SOURCE_ENCODING eq 'latin1' )  and $THE_LINE ) { 
        $THE_LINE = utf8_from_latin1 ( $THE_LINE );

        if ( $ReDIF::Parser::Core::Options ->{utf8_output} ) {
        
        $THE_LINE =~ 
#    1   2   3   4   5   6   7   8   9  10  11  12                                  5  
tr/\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C/\x{20AC}\x{201A}\x{192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}/d;

        $THE_LINE =~ 
tr/\x8E\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F\x81\x8D\x8F\x90\x9D/\x{017D}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{017E}\x{0178}/d;
      }
    } 
      
      
    
    return 1;
}


#######   sub  O U T P U T   T H E   L I N E   ############################

sub output_the_line {
    
    if ( defined $Options->{output_source} ) {
        $Options->{output_source}-> add_data( "> $THE_LINE_ORIGINAL" );
    } 

    if ( defined $Options->{output_source_utf8} ) {
        $Options->{output_source_utf8}-> add_data( "> $THE_LINE" );
    }    

    if( ReDIF::Parser::calculate_md5_checksum ) {
        if ( $THE_LINE ) {
            $TEMPLATE_CHECKSUM -> add( Encode::encode_utf8($THE_LINE) );
            $TEMPLATE_CHECKSUM -> add( "\n" );
        }
    }

}

#######   sub  P R O C E S S   L I N E    #############

sub process_line {

  my( $file_position, $line_number, $attribute_flag );

  $file_position = $THE_LINE_FILE_POSITION;
  $line_number   = $LINE_NUMBER;

  $attribute_flag = 1;

  output_the_line();
  undef $THE_LINE;

  return( $file_position, $line_number, $attribute_flag );

}

#######   sub  E X T R A C T   N E X T   A T T R I B U T E    #############

# SK, 2010-09-18
# Fix for Perl 5.12, replace goto PROCESS with calls to
# sub process_line, much cleaner
#

use vars qw( $next_attribute $next_value );

sub extract_next_attribute {

    undef $ATTRIBUTE;

    my   $attribute_flag ;
    my ( $attribute, $value, $file_position, $line_number );

    my $close_template;

    if ( defined $next_attribute ) {
        $attribute = $next_attribute;
        $value     = $next_value;
        
        undef $next_attribute ;
       
#        goto PROCESS;

       ( $file_position, $line_number, $attribute_flag ) = process_line();
        if ( $EOF_STATUS ) {
            $close_template = 1;
            goto FINISH;
        }

    } 

    while ( 1 ) {

        GET_NEXT_LINE2:
        if ( not defined $THE_LINE ) {

            #  get next line of data
          GET_NEXT_LINE:
            extract_next_line() ;  

            if ( DEBUG ) {
                print " L\{ $THE_LINE \} ($LINE_NUMBER)\n";
            }
        }

        if ( not defined $THE_LINE ) {
            if ( $EOF_STATUS ) { 
                if ( DEBUG ){ print " {EOF} "; } 
                $close_template = 1; 
                last;
            }
            die;
        }
        
        if ( not $THE_LINE ) {
            output_the_line();
            undef $THE_LINE;
            if ( $EOF_STATUS ) { 
                if( DEBUG ){ print " {EOF} "; } 
                $close_template = 1; 
                last;
            }
            goto GET_NEXT_LINE2;
        }

        if ( $THE_LINE =~ /^#/ ) {
            undef $THE_LINE;
            goto GET_NEXT_LINE2;
        }

        if ( $THE_LINE =~ /^\x1a$/ ) {   ### single ^Z on a line ignored
            undef $THE_LINE;
            goto GET_NEXT_LINE2;
        }

        my ( $a, $v ) = ( $THE_LINE =~ /^([a-zA-Z\-]+):\s*(.*)/ );

        if ( defined $a ) {

            $a = lc $a;

            if ( $attribute_flag ) {

                ###  Check for "template-type" attribute or equivalent

                if ( $TEMPLATE_STARTERS{ $a } ) {
                    $close_template = 1;
                }

                ###  then its a last attribute of 
                ###  the previously read template
                
                #    send the attribute!
                $next_attribute = $a;
                $next_value     = $v;
                last;
            } 
            
            
            $attribute = $a;
            $value     = $v;
            
          PROCESS:
            
            ( $file_position, $line_number, $attribute_flag ) = process_line();
            
        } else {
            
            if ( $attribute_flag    ) {
                $value .= "\n";
                $value .= $THE_LINE;
                output_the_line();
                undef $THE_LINE;
                
            } else {
                
                $attribute  = undef;
                $value      = $THE_LINE;
                ( $file_position, $line_number, $attribute_flag ) = process_line();
                last;
            }
            
        }
        
        if ( $EOF_STATUS ) {
            $close_template = 1;
            last;
        }
        
    }  ### while (1) loop upon the data lines, the end of
    
    FINISH:
    
    my $attr_out = $Options->{attribute_output};

    if ( defined $attr_out ) {
        if ( $attribute_flag ) {            
            warn "sending '" . $attribute . "' ..."
                if DEBUG;

            if ( $Options->{'remove_newline_from_values'} ) {
                $value =~ s/\s+/ /g;
            }

            ### FLUSH
            $attr_out->the_attribute( $attribute, 
                                      $value, 
                                      $file_position,
                                      $line_number
                                      );

            if ( defined $Options->{output_source} ) {
                $Options->{output_source}     -> flush_data_buffer ();
            } 
            if ( defined $Options->{output_source_utf8} ) {
                $Options->{output_source_utf8}-> flush_data_buffer ();
            }    

        } else {
            return undef;
        }

        if ( $close_template ) {
            
            warn "closing template (right after $attribute)"
                if DEBUG;
            if ( ReDIF::Parser::calculate_md5_checksum ) {
                my $digest = $TEMPLATE_CHECKSUM->b64digest;
                $attr_out-> close_current_template ( $digest );
#               if( DEBUG ) {
#                   print "MD5: " , $digest, "\n";
#               }
            } else {
                $attr_out-> close_current_template ( 0 );
            }
        }
        
        return 1;
    }

    if ( not $attribute_flag ) {
        return undef;
    }

    return 1;
}


#######   sub  P R O C E S S   F I L E   ###################################

sub process_file {
    my $filename = shift;
    my $position = shift; 
    my $eof_reached = 0;

    my @LINES;

    start_file ( $filename, $position ) or return undef;
    
    while ( 1 ) {
        while ( 1 ) {
            my $got_line = extract_next_line ( );
            my $pos = $THE_LINE_FILE_POSITION;
            if ( not $got_line ) {
                last;
            }
            my $line = { text => $THE_LINE       , 
                         pos  => $pos
                         };

            push @LINES, $line;
        }
        if ( $EOF_STATUS ) { last; }
    }
    
    close FILE;
    return( $EOF_STATUS,  [ @LINES ] );
}


#######   sub  S E L F   T E S T   E X T R A C T    N E X T   L I N E   #######

sub self_test_extract_next_line {
    my $file_name = shift;
    my $enc = shift;
    my $line_lengths = shift;

    my @results ;
    
    start_file( $file_name );
    push @results, 
        $LINE_ENDING, 
        defined( $CURRENT_BUFFER_POSITION ), 
        defined( $CURRENT_FILE_POSITION ), 
        defined( $SOURCE_ENCODING ), 
}


# ---------------------------------------------------------------------------

1;


__END__ 

            ##################################################################
            ### INLINED extract_next_line() FUNCTION: START

            if( not $PARSING_STRING ) {
        
                $THE_LINE_FILE_POSITION = tell( FILE );
                $THE_LINE_ORIGINAL = $THE_LINE = <FILE>;
                
                if( not defined $THE_LINE ) {
                    $EOF_STATUS = 1;
                    $close_template = 1;
                    last;
                    return undef;
                }

                chomp $THE_LINE_ORIGINAL;
                $THE_LINE =~ s/\s+$//g;

            } else {

                $THE_LINE_ORIGINAL = $THE_LINE = shift @LINES;

                if( not defined $THE_LINE ) {
                    $EOF_STATUS = 1;
                    $close_template = 1;
                    last;
                    
                    return undef;
                }
                
                $THE_LINE =~ s/\s+$//g;
                my $the_line_length       = length( $THE_LINE );
                $THE_LINE_FILE_POSITION   = $CURRENT_BUFFER_POSITION;
                $CURRENT_BUFFER_POSITION += $the_line_length + 1;
            }

            if( defined $LINE_NUMBER ) { $LINE_NUMBER++; }
    
            ###   UNICODE
            
            if( ( $SOURCE_ENCODING eq 'latin1' )  and $THE_LINE ) { 
                $THE_LINE = utf8_from_latin1 ( $THE_LINE );
            }

            ### INLINED extract_next_line() FUNCTION: END
            ##################################################################










