###########################################################################
#
# SplitJSONFile.pm
#  -- A plugin for splitting JSON input files into segments that will
#     then be individually processed.
#
#     Inherits from SplitTextFile, overriding the relevant plugin argument
#     and functions, so the specified nested field within the JSON
#     is used as the split point
#
# Copyright 2023 The New Zealand Digital Library Project
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package SplitJSONFile;

use SplitTextFile;
use gsprintf 'gsprintf';
use util;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

# SplitJSONFile is a sub-class of SplitTextFile
BEGIN {
    # Establish the inheritance chain at compile time: SplitTextFile is
    # the sole parent class of SplitJSONFile.
    @SplitJSONFile::ISA = qw(SplitTextFile);
}


# Plugin arguments contributed by SplitJSONFile.  All three are optional
# strings that default to the empty string.  Each 'desc' value is a
# placeholder key of the form {SplitJSONFile.<name>} (presumably resolved
# to display text elsewhere -- confirm against the strings resources).
my $arguments = [
    # Comma-separated list of dotted JSON field paths; read by
    # split_text_into_segments() to locate the array(s) to split on.
    {
        name => "split_exp",
        desc => "{SplitJSONFile.split_exp}",
        type => "string",
        deft => "",
        reqd => "no",
    },
    # Not read anywhere in this file; presumably consumed by a
    # subclass -- TODO confirm.
    {
        name => "metadata_exp",
        desc => "{SplitJSONFile.metadata_exp}",
        type => "string",
        deft => "",
        reqd => "no",
    },
    # Not read anywhere in this file; presumably consumed by a
    # subclass -- TODO confirm.
    {
        name => "file_exp",
        desc => "{SplitJSONFile.file_exp}",
        type => "string",
        deft => "",
        reqd => "no",
    },
];

# Plugin description record; new() pushes it onto the shared "OptList".
# 'args' points at the argument table declared above.
my $options = {
    name     => "SplitJSONFile",
    desc     => "{SplitJSONFile.desc}",
    abstract => "yes",
    inherits => "yes",
    args     => $arguments,
};


# Constructor.
#
# Arguments:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile's constructor
#   $hashArgOptLists - hash ref; this plugin's argument table is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by SplitTextFile's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Use an explicit class-method call rather than the indirect object
    # form ("new SplitTextFile(...)"): inside a package that defines its
    # own new(), the indirect form relies on parser heuristics to be read
    # as a method call.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}


# Default pattern for the files this plugin handles: any name ending
# in ".json", matched case-insensitively.
sub get_default_process_exp {
    my $json_suffix_exp = '(?i)\.json$';
    return $json_suffix_exp;
}

# The default is to assume the top-level JSON structure is an array
# Signal this by returning an empty string
sub get_default_split_exp {
    # No field path: split_text_into_segments() then splits on the
    # top-level JSON value itself.
    my $no_field_path = q{};
    return $no_field_path;
}

# Parse a JSON document and split it into one segment per array element.
#
# Arguments:
#   $textref - reference to a string holding the whole JSON document.  It is
#              handed to JSON::from_json(), so a document that cannot be
#              parsed makes this method die (per the JSON module's
#              documented behaviour).
#
# The plugin option 'split_exp' is read as a comma-separated list of dotted
# field paths (an optional leading '.' is ignored).  An empty path selects
# the top-level JSON value.  Every path that resolves to an array has its
# elements appended, in order, to the list of records; paths that cannot be
# resolved, or that resolve to something other than an array, are reported
# on STDERR and skipped.
#
# Returns a reference to an array of strings, each the JSON encoding of one
# record.  With verbosity >= 3 each segment is also echoed to 'outhandle'.
sub split_text_into_segments {
    my $self = shift (@_);
    my ($textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my $json = JSON->new();
    my $json_text_content = JSON::from_json($$textref);

    my $json_array = [ ];

    # An unset split_exp is treated like the empty string (top-level array)
    my $split_exp = $self->{'split_exp'};
    $split_exp = "" if (!defined $split_exp);

    my @split_exps = split(/\s*,\s*/,$split_exp);

    # split() of an empty string yields no fields at all; fall back to the
    # single empty path so the top-level value is still examined
    if (scalar(@split_exps) == 0) {
        @split_exps = ("");
    }

    foreach my $full_json_field (@split_exps) {

        $full_json_field =~ s/^\.?//;

        my @json_field_parts = ($full_json_field eq "") ? () : split(/\./,$full_json_field);

        my $json_cursor = $json_text_content;

        my $had_error = 0;

        foreach my $json_field_part (@json_field_parts) {
            # Only a JSON object (Perl hash) can be descended into by key.
            # Without this check an array on the path dies with
            # "Not a HASH reference", and (as this file disables strict
            # refs) a plain string would be used as a symbolic reference.
            if (ref($json_cursor) ne "HASH") {
                print STDERR "Warning: failed to find '$full_json_field' in JSON file.  Cannot look up '$json_field_part' because its parent is not a JSON object\n";
                print STDERR "-> Skipping\n";
                $had_error = 1;
                last;
            }

            $json_cursor = $json_cursor->{$json_field_part};
            if (!defined $json_cursor) {
                print STDERR "Warning: failed to find '$full_json_field' in JSON file.  '$json_field_part' did not exist\n";
                print STDERR "-> Skipping\n";
                $had_error = 1;
                last;
            }
        }

        next if ($had_error);

        if (ref($json_cursor) eq "ARRAY") {
            push(@$json_array,@$json_cursor);
        }
        else {
            if ($full_json_field eq "") {
                print STDERR "Warning: The top-level of the JSON file is not an array field\n";
            }
            else {
                print STDERR "Warning: The specified field '$full_json_field' was detected in the JSON file, however it is not an array field\n";
            }
            print STDERR "-> Skipping\n";
        }
    }

    if ($verbosity>=3) {
        print $outhandle "----------\n";
        print $outhandle "SplitJSONFile -- Segments\n";
        print $outhandle "----------\n";
    }

    my @segments = ();
    foreach my $seg_json_rec (@$json_array) {

        # Re-serialise each record on its own (the original comment notes
        # the result is expected to be a unicode string)
        my $seg_json_unicode_str = $json->encode($seg_json_rec);

        if ($verbosity>=3) {
            # Encode to UTF-8 bytes purely for printing
            my $seg_json_utf8_printable_str = Encode::encode("utf8",$seg_json_unicode_str);

            print $outhandle "  --------\n";
            print $outhandle "  $seg_json_utf8_printable_str\n";
            print $outhandle "  --------\n";
        }

        # Defensive: never hand back an empty segment
        if ($seg_json_unicode_str ne "") {
            push @segments, $seg_json_unicode_str;
        }
    }

    return \@segments;
}



1;
