#!/usr/bin/perl

use strict;
use warnings;
use XML::LibXML;
use Encode qw(is_utf8 encode decode);
use Storable;
use Data::Dumper;
use List::Util qw(shuffle);

## files and directorise
my $home_dir=$ENV{'HOME'};
## default place where to look for profiles,
## if nothing else is given as the first argument
my $where_to_search="$home_dir/opt/profiles";
my $queries_file="$home_dir/opt/queries";
my $titles_file="$home_dir/opt/titles.dump";

## ns constants
my $acis_ns='http://acis.openlib.org';
my $amf_ns='http://amf.openlib.org';

## global variables
my $queries;
my $titles;

## global actions
my $parser = XML::LibXML->new();
$parser->no_network(1);

## deal with pontential argument
if(defined($ARGV[0])) {
  $where_to_search=$ARGV[0];
}
else {
  $where_to_search="$home_dir/opt/profiles";
}

## main call, poplaletes queris and titles
&work_with_files($where_to_search);

## parameter gives upper limit to the talovaya
## the higher the limit the more queries
&add_titles_to_queries(0.1);
## write out queries
&write_queries_to_file($queries_file);


sub work_with_files {
  my $input=shift;
  if(-l $input) {
    ## add a slash if it is a link
    if(not $input=~m|/$|) {
      $input="$input/";
    }
  }
  elsif(-f $input) {
    &form_queries_from_file($input);
  }
  if(-d $input or -l $input) {    
    foreach my $file (`find $input -type f -name '*.amf.xml'`) {
      chomp $file;
      &form_queries_from_file($file);
    }
  }
  else {
    print "invalid input: '$input'\n";
  }
}

##
sub write_queries_to_file {
  my $queries_file=shift;
  open my $out, ">:encoding(utf8)", $queries_file or die;
  my $max_decode=3;
  foreach my $query (shuffle keys %{$queries}) {
    print $out $query, "\n";
  }
  close($out);
}


##
sub form_queries_from_file {
  my $file=shift;
  ## this also puts title is $titles;
  my $extract=&extract_data_from_file($file);
  &find_author_expressions($extract);
  ## add co-authors to queries
  &find_coauthor_expressions($extract);
}

sub find_author_expressions {
  my $extract=shift;
  my $last_name=$extract->{'familyname'};
  my $given_name=$extract->{'givenname'};
  my $additional_name=$extract->{'givenname'};
  my $name_suffix=$extract->{'namesuffix'};
  my $name_prefix=$extract->{'nameprefix'};
  my $variations=$extract->{'variations'};
  if(defined($variations)) {
    foreach my $variation (keys %{$variations}) {
      if(not defined($queries->{$variation})) {
        $queries->{$variation}=0;
      }
      $queries->{$variation}++;
    }
  }
  ## fixme: add home-grown variations
}


##
sub find_coauthor_expressions {
  my $extract=shift;
  my $last_name=lc($extract->{'familyname'});
  foreach my $name (keys %{$extract->{authors}}) {
    ## cut initials
    #print "name '$name' is ";
    ## normalize space
    $name=&normalize_space($name);
    ## try not to use first names
    ## if there is a comma
    if($name=~m|,|) {
      $name=~s|,.*||;      
    }
    else {
      ## remove first component   
      $name=~s|^\S+\s||; 
    }
    ## remove initials
    my $count_initials_removed=0;
    while($name=~s|[A-Z]\.||) {
      $count_initials_removed++;
    }
    ## normalize space
    $name=&normalize_space($name);
    if(not $name) {
      next;
    }
    $name=lc($name);
    ## skip if the last name is contained
    if($name=~m|\Q$last_name\E|i) {
      next;
    }
    ## add author name at beginning a
    my $co_name="$last_name $name";
    if(not defined($queries->{$co_name})) {
      $queries->{$co_name}=0;
    }
    $queries->{$co_name}++;
    ## add author name at end
    $co_name="$name $last_name";
    if(not defined($queries->{$co_name})) {
      $queries->{$co_name}=0;
    }
    $queries->{$co_name}++;
  }
}

##
sub normalize_space {
  my $in=shift;
  $in=~s|^\s+||;
  $in=~s|\s+$||;
  $in=~s|\s+| |;
  return $in;
}


##
sub extract_data_from_file {
  my $file=shift;
  my $fh;
  ## output structure computed from the profile
  my $out;
  open $fh,"< $file";
  binmode $fh; # drop all PerlIO layers possibly created by a use open pragma
  my $doc = eval{ $parser->load_xml(IO => $fh)};
  my $root_element=$doc->documentElement;
  ## person element, we look only at the first one
  my $person_element=$root_element->getElementsByTagName('person')->[0];
  ## fixme: should use namespace here
  foreach my $adjectiv ('name','familyname','givenname','namesuffix','nameprefix') {
    ## only consider the first occurance
    ## fixme: this should really look for children of the first person elemnent
    my $adjectiv_element=$person_element->getElementsByTagNameNS($amf_ns,$adjectiv)->[0];
    if(not $adjectiv_element) {
      next;
    }
    $out->{$adjectiv}=$adjectiv_element->textContent;
  }
  foreach my $external ('shortid') {
    ## only consider the first occurance
    ## fixme: this should really look for children of the first person elemnent
    my $external_element=$person_element->getElementsByTagNameNS($acis_ns,$external)->[0];
    if(not $external_element) {
      warn "no shortid for". $person_element->toString(2);
      next;
    }
    $out->{$external}=$external_element->textContent;
  }
  ## now collect the name variations
  my @variation_elements=$person_element->getElementsByTagNameNS($acis_ns,'variation');
  foreach my $variation_element (@variation_elements) {
    my $variation=$variation_element->textContent;
    $out->{'variations'}->{$variation}=1;
  }
  my @isauthorof_elements=$root_element->getElementsByTagName('isauthorof'); 
  ## collect texts that the person has authored
  my $count=0;
  my $shortid=$out->{'shortid'};
  my $accept_count=0;
  foreach my $isauthorof_element (@isauthorof_elements) {
    my @text_elements=$isauthorof_element->getElementsByTagName('text'); 
    foreach my $text_element (@text_elements) {
      ##$out->{'texts'}->[$count++]=$text_element;
      ## computes a summary of the text element and adds it to $out
      $out=&summarize_text_authors($text_element,$out);
      ## then collect the title
      my $title=$text_element->getElementsByTagName('title')->[0]->textContent;
      $titles->{$shortid}->{'a'}->[$accept_count++]=$title;
    }
  }
  my $reject_count=0;
  my @hasnoconnectionto_elements=$root_element->getElementsByTagNameNS($acis_ns,'hasnoconnectionto'); 
  foreach my $hasnoconnectionto_element (@hasnoconnectionto_elements) {
    my @text_elements=$hasnoconnectionto_element->getElementsByTagName('text');
    foreach my $text_element (@text_elements) {
      ## just collect the title
      my $title=$text_element->getElementsByTagName('title')->[0]->textContent;
      $titles->{$shortid}->{'r'}->[$reject_count++]=$title;
    }
  }
  ## add lastname to titles
  $titles->{$shortid}->{'name'}=lc($out->{'familyname'});
  return $out;
}

##
sub summarize_text_authors {
  my $text_element=shift;
  my $out=shift;
  #print $text_element->toString(2);
  ## fixme: this ignores possible subdivision of names, but
  ## they are not likely in the data that we have
  my @name_elements=$text_element->getElementsByTagName('name'); 
  foreach my $name_element (@name_elements) {
    my $name=$name_element->textContent;
    if(not defined($out->{'authors'}->{$name})) {
      $out->{'authors'}->{$name}=0;
    }
    $out->{'authors'}->{$name}++;
  }
  return $out;
}

  
## 
sub add_titles_to_queries {
  my $limit=shift;
  ## internal temproray storage
  my $t;
  foreach my $shortid (keys %$titles) {
    #print "shortid is $shortid\n";
    if(not $shortid) {
      next;
    }
    foreach my $status ('a','r') {
      #print "status is $status\n";
      if(not defined($titles->{$shortid}->{$status})) {
        next;
      }
      my @titles=@{$titles->{$shortid}->{$status}};
      foreach my $title (@titles) {
        #print "title is $title\n";
        foreach my $term (split(/\W+/,$title)) {
          my $term=lc($term);
          if($term=~m|^\d+$|) {
            next;
          }      
          if(length($term)<2) {
            next;
          }
          if(not defined($t->{$shortid}->{$status}->{$term})) {
            $t->{$shortid}->{$status}->{$term}=0;
          }
          $t->{$shortid}->{$status}->{$term}++;
          if(not defined($t->{$shortid}->{'t'}->{$status})) {
            $t->{$shortid}->{'t'}->{$status}=0;
          }
          $t->{$shortid}->{'t'}->{$status}++;
          if(not defined($t->{'total'}->{$status}->{$term})) {
            $t->{'total'}->{$status}->{$term}=0;
          }
          $t->{'total'}->{$status}->{$term}++;
          if(not defined($t->{'total'}->{'t'}->{$term})) {
            $t->{'total'}->{'t'}->{$term}=0;
          }
          $t->{'total'}->{'t'}->{$term}++;
          if(not defined($t->{'grand'})) {
            $t->{'grand'}=0;
          }
          $t->{'grand'}++;
        }
      }
    }
    ## now set the own document ratio
    my $number_of_accepted;
    if(defined $titles->{$shortid}->{'a'}) {    
      $number_of_accepted=scalar(@{$titles->{$shortid}->{'a'}});
    }
    else {
      $number_of_accepted=0;
    }
    my $number_of_refused;
    if(defined $titles->{$shortid}->{'r'}) {    
      $number_of_refused=scalar(@{$titles->{$shortid}->{'r'}});
    }
    else {
      $number_of_refused=0;
    }
    ## nothing in the profile?
    if($number_of_accepted+$number_of_refused<1) {
      next;
    }
    $t->{$shortid}->{'odr'}=$number_of_accepted/($number_of_accepted+$number_of_refused);
  }
  ## start with the talovaya
  my $talovaya;
  foreach my $shortid (keys %$titles) {
    #print "shortid is $shortid\n";
    if(not $shortid) {
      next;
    }
    ## calculate use of term by others
    #print "$term $shortid\n";
    ## calcultate for accepted terms only
    foreach my $term (keys %{$t->{$shortid}->{'a'}}) {  
      if(not defined($t->{$shortid}->{'r'}->{$term})) {
        $t->{$shortid}->{'r'}->{$term}=0;
      }
      if(not defined($t->{$shortid}->{'t'}->{'r'})) {
        $t->{$shortid}->{'t'}->{'r'}=0;
      }
      my $other_use=($t->{'total'}->{'t'}->{$term}-
                     $t->{$shortid}->{'a'}->{$term}-
                     $t->{$shortid}->{'r'}->{$term}
                    ) 
        /
          ($t->{'grand'}-
           $t->{$shortid}->{'t'}->{'a'}-
           $t->{$shortid}->{'t'}->{'r'});
      my $numerator=($t->{$shortid}->{'odr'})*$other_use;
      if($t->{$shortid}->{'t'}->{'r'}>0) {
        $numerator+=
          (1-$t->{$shortid}->{'odr'})
            *($t->{$shortid}->{'r'}->{$term})
              / ($t->{$shortid}->{'t'}->{'r'});    
      }
      my $denominator=$t->{$shortid}->{'a'}->{$term}
        / ($t->{$shortid}->{'t'}->{'a'});
      $talovaya->{$shortid}->{$term}=$numerator/$denominator;  
    }  
  }
  foreach my $shortid (keys %{$talovaya}) {
    my $name=$titles->{$shortid}->{'name'} or warn "no shorid\n";
    foreach my $term (keys %{$talovaya->{$shortid}}) {
      if($talovaya->{$shortid}->{$term}<$limit) {
        $queries->{"$term $name"}++;
        $queries->{"$name $term"}++;
      }
    }
  }
}