#!/usr/bin/perl

use strict;
use warnings;
use open ':utf8';
use Carp::Assert;
use Data::Dumper;
use File::Copy;
use File::Basename;
use File::Path;
use List::Util qw(shuffle);
use IO::File;
use Compress::Zlib;
use Digest::MD5 qw(md5_hex);
use XML::LibXML;


## files and directories
## every path hangs off the home directory of the user; without HOME
## the paths below would silently be rooted at "/", so stop early
my $home_dir=$ENV{'HOME'};
if(not defined $home_dir or not length $home_dir) {
  die "HOME is not set, can not locate the data directories\n";
}
my $data_dir="$home_dir/data";
my $opt_dir="$home_dir/opt";
my $var_dir="$home_dir/var";
## the directory that is scanned for *.xml files
my $crossref_dir="$var_dir/crossref";

## binmode of run
## reports go to STDOUT as UTF-8
binmode(STDOUT,':utf8');


## url constants
my $labs_url='http://api.labs.crossref.org';
my $sigg_url=$labs_url.'/search?q=';
##my $search_url='http://trabbi.openlib.org/home/krichel';

## run parameters
## unbuffered output, so that reports appear as they are produced
$|=1;

## one parser is shared by all files; it must not fetch anything
## over the network while loading a document
my $dom = XML::LibXML->new();
$dom->no_network(1);

## walk over every *.xml file below $crossref_dir, and print one line
## per item, i.e. per parent of a doi_data element
## find is started through a list-form pipe, so the directory name is
## never interpreted by a shell, and the names are read one at a time
open(my $find_fh,'-|','find',$crossref_dir,'-name','*.xml')
  or die "can not run find on '$crossref_dir': $!\n";
while(my $file=<$find_fh>) {
  chomp $file;
  my $doc;
  eval { $doc=$dom->parse_file($file);};
  my $error=$@;
  if($error) {
    print "error parsing file '$file': '$error'\n";
    next;
  }
  my $root=$doc->documentElement;
  ## search for the error element
  my @error_elements=$root->getElementsByTagName('error');
  if(scalar @error_elements) {
    ## the texts of all error elements, separated by blanks
    my $errors=join(' ',map { $_->textContent } @error_elements);
    ## this is known issue that we ignore here
    if($errors=~m|not found in CrossRef|) {
      next;
    }
    else {
      ## report what the error elements say, then still look at the
      ## items of the file
      print "error element found in file: '$errors'\n";
    }
  }
  my @doi_data_elements=$root->getElementsByTagName('doi_data');
  foreach my $doi_data_element (@doi_data_elements) {
    my $parent_element=$doi_data_element->parentNode;
    my $report=study_item($parent_element,$file);
    print $report;
    print "\n";
  }
}
## a false close on a pipe means that find itself did not end well
close($find_fh)
  or warn "find on '$crossref_dir' did not finish cleanly (status $?)\n";
  
## Studies one item element, i.e. the parent of a doi_data element.
## Returns a string: either the reason why the item can not be used
## ("no title", "no contributors", "no year"), or "file: " followed by
## <year>/<xxx>.amf.xml, where <year> is the largest year found and
## <xxx> the first three hex digits of the MD5 of the DOI.
## Parameters: the item element, and the name of the file it came
## from; the second one is accepted but not used at present.
## Dies through assert() unless the item has exactly one doi_data
## child, with exactly one doi child and exactly one resource child.
sub study_item {
  my $in=shift;
  my $summary=shift;
  ## fetch the nodes in list context, so that the test for emptiness
  ## does not depend on how a node list behaves as a boolean
  my @title_elements=($in->getElementsByTagName('full_title'),
                      $in->getElementsByTagName('title'));
  if(not @title_elements) {
    return "no title";
  }
  my @contributors_elements=$in->getElementsByTagName('contributors');
  if(not @contributors_elements) {
    return "no contributors";
  }
  ## find the year, if necessary upwards
  ## the search may hand back undefined entries, drop them first
  my @year_elements=grep { defined $_ } find_child_upwards($in,'year');
  if(not @year_elements) {
    return "no year";
  }
  ## keep the largest year; the year becomes part of a file name, so
  ## only text that consists of digits is taken into account
  my $max_year=0;
  foreach my $year_element (@year_elements) {
    my $year_text=$year_element->textContent;
    if(not defined $year_text or $year_text!~m/\A\s*(\d+)\s*\z/) {
      next;
    }
    my $year=$1;
    if($year > $max_year) {
      $max_year=$year;
    }
  }
  if(not $max_year) {
    return "no year";
  }
  ## check there is only one doi_data_element
  my @doi_data_elements=$in->getChildrenByTagName('doi_data');
  assert(scalar @doi_data_elements == 1);
  my $doi_data_element=$doi_data_elements[0];
  ## check there is only one doi, note the use of Child
  my @doi_elements=$doi_data_element->getChildrenByTagName('doi');
  assert(scalar @doi_elements == 1);
  my $doi=$doi_elements[0]->textContent;
  my @resource_elements=$doi_data_element->getChildrenByTagName('resource');
  ## check there is only one url
  assert(scalar @resource_elements == 1);
  my $file=$max_year.'/'.substr(md5_hex($doi),0,3).'.amf.xml';
  return "file: $file";
}


## Searches for elements with the name $child_name below $element.
## When there are none, the search is repeated from the parent node,
## and so on upwards, until a level yields a match or there is no
## parent left.
## Returns the matching elements of the nearest level that has any,
## or the empty list (undef in scalar context) when no level has one.
sub find_child_upwards {
  my $element=shift;
  my $child_name=shift;
  my $node=$element;
  while(defined $node) {
    my @elements=$node->getElementsByTagName($child_name);
    if(@elements) {
      return @elements;
    }
    ## nothing at this level, carry on from the parent
    $node=$node->parentNode;
  }
  ## a bare return, so that a caller in list context gets no elements
  ## rather than a list holding one undef
  return;
}

