#!/usr/bin/perl

use strict;
use warnings;
use open ':utf8';
use XML::LibXML;
use DBI;
use Encode;
use Encode::Guess qw/latin-1/;
use Data::Dumper;
use MySQL::Config qw(parse_defaults);
use LWP::UserAgent;
use File::Temp;
use File::Copy;
use File::Basename;
use File::Path;
use HTTP::Cookies;
use List::Util qw(shuffle);
use IO::File;
use Compress::Zlib;


## files and diretories
my $home_dir=$ENV{'HOME'};
my $data_dir="$home_dir/data";
my $opt_dir="$home_dir/opt";
my $var_dir="$home_dir/var";
my $crossref_dir="$var_dir/crossref";
my $response_file = "$opt_dir/response";
my $junk_file = "$opt_dir/junk";
my $tidy_file = "$opt_dir/tidy.xml";
my $queries_file = "$opt_dir/queries";

## binmode of run
binmode(STDOUT,':utf8');

## url constants
my $labs_url='http://api.labs.crossref.org';
my $sigg_url=$labs_url.'/search?q=';
##my $search_url='http://trabbi.openlib.org/home/krichel';

## run parameters
$|=1;
my $min_wait_queries=60;
my $max_wait_queries=1800;
my $min_wait_results=5;
my $max_wait_results=20;
## can be 0,1,2
my $debug=2;
my $wait_until_renewal=365*24*60*60;
## modes of operation. this can be random, for testing or sequential,
## for normal operation.  this determines whether we are looking at a
## random entry in the queries file or we looking it it sequentially
my $mode;
if($debug>1) {
  $mode='random';
}
else {
  $mode='sequential';
}

## globals
## parser
my $parser = XML::LibXML->new();
$parser->no_network(1);
## user agent
my $ua = LWP::UserAgent->new;
$ua->cookie_jar(HTTP::Cookies->new(file => "$opt_dir/cookies.txt",
                                            autosave => 1));
my @ns_headers = (
                  'User-Agent' => 
                  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.9) Gecko/20100501 Iceweasel/3.5.9 (like Firefox/3.5.9)',
                  'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                  'Accept-Language' => 'en-us,en;q=0.5',
                  'Accept-Encoding' => 'gzip,deflate',
                  'Accept-Charset' => 'utf-8;ISO-8859-1;q=0.7,*;q=0.7',
                  'Keep-Alive' => '300',
                  'Connection' => 'close' );

## main
my $dbh=&open_mysql();


if($ARGV[0]) {
  my $exp=$ARGV[0];  
  $debug=2;
  my $result=&do_query($exp);
  if(not defined($result) or not $result) {
    print "no result";
    next;
  }
  &get_crossref_data($result);
}
else {  
  &do_all_queries();
}



###print &parse_tidy_file;


## tries to parse as latin
sub try_to_read_as_latin_1 {
  my $tidy_file=shift;
  my $tidy_fh = new IO::File;
  $tidy_fh->open("< $tidy_file");
  $tidy_fh->binmode(":iso-8859-1");
  my $doc = eval {$parser->load_html(IO => $tidy_fh,  recover=>1, encoding=>'iso-8859-1')};
  if(not $doc and $@) {
    print "could not parse $tidy_file second time round: $@\n";      
    if($debug > 1) {
      exit;
    }
  }
  return $doc;
}

## parses tidy file
sub parse_tidy_file {
  my $tidy_file=shift;
  if(not defined($tidy_file)) {
    $tidy_file="$opt_dir/tidy.xml";    
  }
  my $tidy_fh = new IO::File;
  $tidy_fh->open("< $tidy_file");
  ## drop all PerlIO layers possibly created by a use open pragma
  binmode $tidy_fh; 
  my $doc = eval {$parser->load_html(IO => $tidy_fh, recover=>1);} ;
  my $error=$@;
  if(not $doc and $@) {
    print "could not parse $tidy_file: $error\n";
    $doc=&try_to_read_as_latin_1($tidy_file);
  }
  if(not $doc) {
    return;
  }
  my $root=$doc->documentElement;
  ## prevent a crash... 
  if(not defined($root)) {
    return;
  }
  ## extract result
  my $result;
  $result=&find_number_of_results($root,$result);
  $result=&get_results($root,$result);
  return $result;
}


## gets result from result structure
sub get_crossref_data {
  my $result=shift;
  ## shuffle half the time
  my $time=time();
  my @results;
  if(int($time/2)*2==$time) {
    if($debug > 1) {      
      print "shuffling result\n";
    }
    @results=shuffle @{$result->{'results'}};
  }
  else {
    if($debug > 1 ) {
      print "no shuffle through result\n";
    }
    @results=@{$result->{'results'}};
  }
  &wait_a_while($min_wait_results,$max_wait_results); 
  foreach my $url (shuffle @{$result->{'results'}}) {    
    ## this will return 0 if the event is denied
    my $i_got_a_file=&get_result_by_url($url);
    if($i_got_a_file) {
      &wait_a_while($min_wait_results,$max_wait_results);     
    }
  }
}

## get a result when a URL has been found
sub get_result_by_url {
  my $url=shift;
  my $file=$url;
  $file=~s|^\Q$labs_url\E||;
  ## this is the file as in the database
  my $file_as_in_db=$file;
  my $main_dirname=$crossref_dir.dirname($file);
  if($debug > 1) {
    print $main_dirname, "\n";
  }
  my $base_name=basename($file);
  if(not $base_name=~s|\.xml$||) {
    print "strange basename: $base_name\n";
  }
  my $base_name_with_dir=$base_name;
  my $count_replacements=$base_name_with_dir=~s|\W|/|g;
  ## if we don't have a replacement, place a single
  ## slash right into the middle.
  if(not $count_replacements) {
    my $half=int(length($base_name)/2);
    if($debug > 1 ) {
      print "I  have to cut $base_name\n";
    }
    $base_name_with_dir=substr($base_name,0,$half);
    $base_name_with_dir.='/';
    $base_name_with_dir.=substr($base_name,$half);
  }
  ## prevent an invisibel file name
  if($base_name_with_dir=~m|/$|) {
    chop $base_name_with_dir;
  }
  ## now finally compose $file
  $file="$main_dirname/$base_name_with_dir.xml";
  if(not -d dirname($file)) {
    mkpath(dirname($file));
  }
  if($debug>0) {
    print "file is $file\n";
  }
  ## check if the file exists and is old
  #if(-f $file and not &can_i_download($file_as_in_db,'results',$wait_until_renewal)) {
  #  return 0;
  #}
  ## only check if old, beacuse above code would reload in a multi-machine setting
  ## this function returns one if we can download
  if(not &can_i_download($file_as_in_db,'results',$wait_until_renewal)) {
    return 0;
  }
  ## junk again. I have to clean out the \r
  my $junk=&query_to_file_or_return($url);  
  ## check if this returns something that starts with "query error"
  if($junk=~m|^\Qquery error\E|) {
    &delete_event($file_as_in_db,'results'); 
    return 1;
  }
  ## if there the DOI is not foud 
  if($junk=~m|DOI Not Found|i) {
    &delete_event($file_as_in_db,'results'); 
    if($debug>0) {
      print "DOI not found error\n";
    }
    return 1;
  }
  if($junk=~m|^\s*The\s+filetype\s+is\s+not\s+supported\s*$|i) {
    &delete_event($file_as_in_db,'results'); 
    if($debug>0) {
      print "filetype error: '$junk'\n";
    }
    return 1;
  }
  if($junk=~m|\s+not\s+found\s+in\s+CrossRef\s*</error>|i) {
    &delete_event($file_as_in_db,'results'); 
    if($debug>0) {
      print "not found in CrossRef error: '$junk'\n";
    }
    return 1;
  }
  if($junk=~m|^\s*$|i) {
    &delete_event($file_as_in_db,'results'); 
    if($debug>0) {
      print "result is empty at CrossRef error: '$junk'\n";
    }
    return 1;
  }
  if($debug>1) {
    print "results query return '$junk'\n";
  }
  ## guess the encoding
  my $decoded=decode_via_guess($junk);
  ## the ^M makes XML go non-well-formed
  $decoded=~s|\r| |g;
  my $out_fh = new IO::File;
  $out_fh->open("> $file");
  $out_fh->binmode(":utf8");
  print $out_fh $decoded;
  $out_fh->close;
  return 1;
}


## finds number of queries, a limit after 
## which to end run
sub find_query_number {
  my $number=`/bin/grep -c ^ $queries_file`;
  chomp $number;
  return $number;
}

## does all queries, at least if run in sequential mode
sub do_all_queries {
  my $count_queries=0;
  my $max_queries=&find_query_number;
  while($count_queries++ < $max_queries) {
    my $exp=&get_a_query($mode,$count_queries);
    if(&can_i_download($exp,'queries',$wait_until_renewal)) {
      if($debug>0) {
        print "I can do query '$exp'\n";
      }
      my $result=&do_query($exp);
      ## this return nothing on error
      if(not defined($result) or not $result) {
        ## delete event, and wait
        &delete_event($exp,'queries');
        &wait_a_while($min_wait_queries,$max_wait_queries);
        next;
      }
      &get_crossref_data($result);
      &wait_a_while($min_wait_queries,$max_wait_queries);
    }
    else {
      if($debug>0) {
        print "not doing query '$exp'\n";
      }
    }
  }
}


## does a query
sub do_query {
  my $exp=shift;
  my $query=$sigg_url.$exp;
  my $junk_file = "$opt_dir/junk";
  my $tidy_file = "$opt_dir/tidy.xml";
  ## don't add junk file at the next line
  my $junk=&query_to_file_or_return($query);
  ## check if this returns something that starts with "query error"
  if($junk=~m|^\Qquery error\E|) {
    &delete_event($query,'queries'); 
  }
  &junk_to_tidy_file($junk,$junk_file,$tidy_file);
  my $result=&parse_tidy_file($tidy_file);
  if($debug>1) {
    print Dumper $result;
  }
  return $result;
}


## gets a new query
sub get_a_query {
  my $mode=shift;
  my $count=shift;
  my $exp;
  if($mode eq 'random') {
    $exp=`rl -c 1 $queries_file`;
    chomp $exp;
    return $exp;
  }
  ## it's not a really efficient way to get a random line,
  ## but with a large number, it would be better than
  ## storing all the queries in memory
  if($mode eq 'sequential') {
    $exp=`head -$count $queries_file | tail -1`;
    chomp $exp;
    return $exp;
  }
  print "fatal: no such mode: $mode\n";
  exit;
}

## check in a table, if the key has been used
sub can_i_download {
  ## the key
  my $exp = shift;
  ## name of the table
  my $table=shift;
  ## time limit after there will be refresh
  my $limit=shift;
  my $time = time();
  ## the column is called event, whatever the table
  my $sth = $dbh->prepare("SELECT time FROM $table WHERE event=?");  
  my $rv = $sth->execute($exp);
  my @row_array = $sth->fetchrow_array;
  if(not @row_array) {
    if($debug>0) {
      print "inserting a new event '$exp'\n";
    }
    $sth = $dbh->prepare("INSERT INTO $table (event,time) VALUES(?,?) ON DUPLICATE KEY UPDATE time=?");
    $rv = $sth->execute($exp,$time,$time); 
    return 1;
  }
  my $last_update=$row_array[0];
  if($time-$last_update>$limit) {
    if($debug>0) {
      print "renew '$exp', last update was $last_update\n";
    }
    return 1;
  }
  if($debug>0) {
    print "I need to skip '$exp', last update was $last_update\n";
  }
  return 0;
}


## delete an event, just in case it has ended in an error
sub delete_event {  
  my $exp = shift; ## the key
  ## name of the table
  my $table=shift;
  ## time limit after there will be refresh
  my $limit=shift;
  my $time = time();
  ## the column is called event, whatever the table
  my $sth = $dbh->prepare("DELETE FROM $table WHERE event=?");  
  my $rv = $sth->execute($exp);
  if($debug>0) {
    print "deleted form table $table key '$exp', return value was $rv\n";
  }
  return 0;
}


## gets a url, returns or stores it
sub query_to_file_or_return {
  my $search_url=shift;
  ## if absent, return contents
  my $dest_file=shift;
  my $dest_fh;
  ## put to file if defined
  if(defined($dest_file)) {
    $dest_fh = new IO::File;
    $dest_fh->open("> $dest_file");
  }
  #binmode($junk_fh,"utf8");
  ## create a request
  my $req = HTTP::Request->new(GET => $search_url, \@ns_headers);
  ## pass request to the user agent and get a response back
  my $res = $ua->request($req);
  ## Check the outcome of the response
  if ($res->is_success) {
    ## deals with compressed response,
    ## but we do the charset later
    my $contents=$res->decoded_content(charset=>'none');
    if(defined($dest_file)) {
      if($debug>0) {
        print "printing to destination file $dest_file\n";
      }
      print $dest_fh $contents;
      $dest_fh->close;
    }
    else {
      if($debug) {
        print "returning contents\n";
      }
      return $contents;
    }
  }
  else {    
    return "query error: ", $res->status_line, "\n";    
  }
}

sub junk_to_tidy_file {
  my $junk=shift;
  my $junk_file=shift;
  if(not defined($junk_file)) {
    $junk_file="$opt_dir/junk";
  }
  ## some tempary code to save the junk to 
  ## look for char probelms.
  #if($debug > 1) {
  #  my $time=time;
  #  my $junk_copy_file="$junk_file.$time";
  #  system("cp $junk_file $junk_copy_file");
  #}
  my $tidy_file = shift;
  if(not defined($tidy_file)) {
    $tidy_file="$opt_dir/tidy.xml";
  }
  $junk=~s|method=GET|method="GET"|g;
  $junk=~s|type=text|type="text"|g;
  $junk=~s|name=q|name="q"|g;
  $junk=~s|size=(\d+)|size="$1"|g;
  $junk=~s|maxlength=(\d+)|maxlength="$1"|g;
  $junk=~s|type=submit|type="submit"|g;
  $junk=~s|&nbsp;| |g;
  $junk=~s|\0240||g;
  $junk=~s|&|&amp;|g;
  $junk=~s|\Q name="addSearch">\E|>|g;
  my $junk_fh = new IO::File;
  $junk_fh->open("> $junk_file");
  $junk_fh->binmode(':utf8');
  print $junk_fh $junk;
  ## use tidy 
  my $s="/usr/bin/tidy -raw -quiet $junk_file > $tidy_file 2> /dev/null";
  if($debug>0) {
    print "donig '$s'\n";
  }
  system($s);
  return $tidy_file;
}

sub wait_a_while {
  my $min_wait=shift;
  my $max_wait=shift;
  my $wait=$min_wait + int(rand($max_wait-$min_wait));
  if($debug>0) {
    print "I wait $wait\n";
  }
  sleep $wait;
}

sub open_mysql {
  my %cfg = parse_defaults "my", qw(client);
  my $dbh = DBI->connect("DBI:mysql:sigg;host=sigg.openlib.org", 
                         $cfg{'user'}, 
                         $cfg{'password'}); 
  ## this could fail because of a lack of internet connection
  if(not defined($dbh)) {
    &wait_a_while($min_wait_queries,$max_wait_queries);
    $dbh=&open_mysql();
  }
  my $sth = $dbh->prepare("SET collation_connection = utf8_general_ci");  
  my $rv = $sth->execute();
  return $dbh;
}

sub find_number_of_results {
  my $doc=shift;
  my $response=shift;
  my @div_elements=$doc->getElementsByTagName('div');
  my $number_of_results; 
  foreach my $div_element (@div_elements) {
    #print $div_element->toString;
    my $id_attribute=$div_element->getAttribute('id') or next;
    if(not $id_attribute eq 'query') {
      next;
    }
    my $results_string=$div_element->textContent;
    $results_string=~m|(\d+) results? for query: (.*)|;
    $response->{'pretended_results'}=$1;
    $response->{'pretended_query'}=$2;    
    return $response;
  }
}
  

sub get_results {
  my $doc=shift;
  my $response=shift;
  my @a_elements=$doc->getElementsByTagName('a');
  my $count=0;
  foreach my $a_element (@a_elements) {
    my $anchor_string=$a_element->textContent;
    if(not $anchor_string =~m|\[xml\]|) {
      next;
    }
    my $href=$a_element->getAttribute('href');
    $response->{'results'}->[$count++]=$href;
  }
  ## prevent the next line from bombing out when there are no
  ## rusults 
  if(not defined($response)) {
    return;
  }
  if(not defined($response->{'results'})) {
    return;
  }
  if($response->{'pretended_results'} != scalar(@{$response->{'results'}})) {
    print "response parsing failed integrety: \n", $doc->toString(2);
    print "pretended: ", $response->{'pretended_results'}, "\n";
    print "actual: ", scalar(@{$response->{'results'}}), "\n";
    #exit;
  }
  elsif($debug>0) {
    print "pretended results: ", scalar(@{$response->{'results'}}), "\n";
  }
  return $response;
}

## try to decode via guess, otherwise fall back to utf-8
## based on "use Encode::Guess qw/latin-1/;"
sub decode_via_guess {
  my $junk=shift;
  my $decoded;
  my $error;
  my $debug=0;  
  eval { 
    $decoded=decode("Guess", $junk);
  };
  if(defined($decoded) and $junk ne $decoded and $debug >1) {
    print "junk is '$junk'\n\n";
    print "decoded is '$decoded'\n\n";
  }
  my $first_error=$@;
  if(not $first_error) {
    return $decoded;
  }
  if($debug>1) {
    print "first error '$first_error' decoding\n'$junk'\n\n";
  }
  if($first_error=~m|\Qiso-8859-1 or utf8\E|) {
    eval { 
      $decoded=decode_utf8($junk);
    };
    my $second_error=$@;
    if(not $second_error) {
      return $decoded;      
    }
    return $junk;
    print "second error '$second_error' decoding\n'$junk'\n\n";
  }
  return $junk;
}
