/**
 *#########################################################################
 * GSearchConnection.java - works with the demo-client for Greenstone 3, 
 * of the Greenstone digital library suite from the New Zealand Digital 
 * Library Project at the University of Waikato, New Zealand.
 * <BR><BR>
 * Copyright (C) 2008 New Zealand Digital Library Project
 * <BR><BR>
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * <BR><BR>
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *########################################################################
 */

package org.greenstone.fedora.services;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.rpc.ServiceException;

import org.apache.axis.client.Call;
import org.apache.axis.client.Service;
import org.apache.log4j.Logger;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;


/** 
 * Class GSearchConnection connects to FedoraGSearch's web services.
 * FedoraGSearch offers indexing and full-text search functionality for 
 * Fedora repositories. Its search web service (method gfindObjects)
 * returns the response of a search as XML.
 * GSearchConnection offers more convenient methods that extract just 
 * the parts of the search results that FedoraGS3Connection needs and
 * return them.
 * @author ak19 
*/
public class GSearchConnection implements FedoraToGS3Interface.Constants {
	/** Logger for this class. */
	private static final Logger LOG = Logger.getLogger(
			GSearchConnection.class.getName());

	/* Accessing the web services of Fedora Generic Search */
	/** Namespace URI of the service that is looked up in the WSDL. */
	protected static String NAMESPACE_URI = "http://server.fedoragsearch.defxws.dk";
	/** Local name of the service that is looked up in the WSDL. */
	protected static String SERVICE_NAME = "OperationsService";

	/** The names of the methods we use of Fedora Generic Search's web services
	 * are declared here as static final Strings. */
	protected static final String G_FIND_OBJECTS = "gfindObjects";

	/* Some fixed string literals that will be encountered in the response XMLs
	 * that FedoraGSearch's method gfindObjects() returns. */
	protected static final String PID = "PID";
	protected static final String HIT_TOTAL = "hitTotal";
	protected static final String OBJECT = "object";
	protected static final String FIELD = "field";
	protected static final String NAME = "name";
	protected static final String DC_TITLE_FIELD = "dc.title";
	protected static final String FULLTEXT_FIELD = "ds.fulltext";
	public static final String ALL_INDEXED_FIELDS = "foxml.all.text";

	/** Separator used internally to separate the terms of a search field. */
	protected static final String SPACE = " ";

	/** Value sent as the fieldMaxLength argument of gfindObjects. The original
	 * author's notes say that field values exceeding it are truncated, so it
	 * has to be long enough for the PIDs we read from the response (whether
	 * the unit is characters or words was left unresolved in those notes). */
	private static final int FIELD_MAX_LENGTH = 100;

	/** The name of the Index wherein FedoraGSearch has indexed all the GS3 docs.
	 * This final member is public here so that others may read the indexName
	 * that this GSearchConnection works with. */
	public final String indexName;

	/** The Service object used to connect to the FedoraGSearch web services */
	protected final Service service;
	/** The Call object used to connect to the FedoraGSearch web services */
	protected final Call call;
	/** The portName object used when connecting to FedoraGSearch's web services */
	protected final QName portName;

	/** A DocumentBuilder object used to construct and parse XML */
	protected final DocumentBuilder builder;


	/** Constructor that takes a String representing the url of the WSDL
	 * file for FedoraGSearch's web services, and tries to establish a
	 * connection to those web services.
	 * @param wsdlFileLocation is a String representing the url of the WSDL file
	 * @param indexName is the name of the index that Fedora Generic Search
	 * should work with (the index wherein the indexed GS3 documents have been
	 * placed).
	 * @throws MalformedURLException if wsdlFileLocation is not a valid URL
	 * @throws ServiceException if the service cannot be created from the WSDL
	 * or if the service declares no port
	 * @throws ParserConfigurationException if no DocumentBuilder can be created
	*/
	public GSearchConnection(String wsdlFileLocation, String indexName)
		throws MalformedURLException, ServiceException,
			ParserConfigurationException
	{
		this.indexName = indexName;

		URL wsdlURL = new URL(wsdlFileLocation);
		service = new Service(wsdlURL, new QName(NAMESPACE_URI, SERVICE_NAME));

		// FIXME: can we just assume it's the first port of service SERVICE_NAME?
		// Do we need to work out which port to get??? Remember, the port names
		// vary between wsdls though!
		Iterator<?> ports = service.getPorts();
		if(!ports.hasNext()) { // should never happen: a service without a port
			// No Call is created here: the object is unusable without a port
			// and the exception aborts construction anyway.
			throw new ServiceException(this.getClass() + ": No port in wsdl file");
		}
		portName = (QName)ports.next();
		call = (Call)service.createCall(portName);

		String endpointLocation = call.getTargetEndpointAddress();
		LOG.debug("Wsdl file url: " + wsdlURL
				+ "\nEndpoint location is: " + endpointLocation);

		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		builder = factory.newDocumentBuilder(); // to create XML docs
	}

	/**
	 * Method to invoke gfindObjects operation of Fedora Generic Search
	 * web services.
	 *
	 * The wsdl file for the Fedora 2 version of the Fedora Generic Search web
	 * services, located at
	 * http://localhost:8080/fedoragsearch/services/FgsOperations?wsdl
	 * declared the operation as follows:
	 * <pre>
	 * &lt;wsdl:message name="gfindObjectsRequest"&gt;
	 *  &lt;wsdl:part name="query" type="xsd:string"/&gt;
	 *  &lt;wsdl:part name="sort" type="xsd:string"/&gt;
	 *  &lt;wsdl:part name="hitPageStart" type="xsd:int"/&gt;
	 *  &lt;wsdl:part name="hitPageSize" type="xsd:int"/&gt;
	 *  &lt;wsdl:part name="snippetsMax" type="xsd:int"/&gt;
	 *  &lt;wsdl:part name="fieldMaxLength" type="xsd:int"/&gt;
	 *  &lt;wsdl:part name="indexName" type="xsd:string"/&gt;
	 *  &lt;wsdl:part name="resultPageXslt" type="xsd:string"/&gt;
	 * &lt;/wsdl:message&gt;
	 *
	 * &lt;wsdl:message name="gfindObjectsResponse"&gt;
	 *  &lt;wsdl:part name="gfindObjectsReturn" type="xsd:string"/&gt;
	 * &lt;/wsdl:message&gt;
	 *
	 * &lt;wsdl:operation name="gfindObjects"
	 *  parameterOrder="query sort hitPageStart hitPageSize snippetsMax
	 *  fieldMaxLength indexName resultPageXslt"&gt;
	 * </pre>
	 * <b>Note:</b> this method does NOT send the arguments in that order. It
	 * uses the order of GSearch 2.2 for Fedora 3, which is: query,
	 * hitPageStart, hitPageSize, snippetsMax, fieldMaxLength, indexName, sort,
	 * resultPageXslt.
	 *
	 * There's no example on how to call gfindObjects with parameters. In
	 * particular, it is not known what values the parameter <b>sort</b> can
	 * take. But topazproject has an example on how to call updateIndex().
	 * @param searchFieldedTerms is the query, optionally containing field
	 * prefixes such as <code>dc.title:</code>
	 * @param sort is passed on as the sort argument of gfindObjects
	 * @param hitPageStart is passed on as the hitPageStart argument
	 * @param hitPageSize is passed on as the hitPageSize argument
	 * @param snippetsMax is passed on as the snippetsMax argument
	 * @param indexName is the name of the index to search in
	 * @param resultPageXslt is passed on as the resultPageXslt argument
	 * @return the response of gfindObjects, which is an XML String
	 * @throws Exception if invoking the web service fails
	 * @see <a href="http://www.topazproject.org/trac/wiki/FedoraSearch?format=txt">An example on how to call updateIndex() with parameters</a>
	 * @see <a href="http://ws.apache.org/axis/java/apiDocs/org/apache/axis/client/Service.html">Axis Service class</a>
	 * @see <a href="http://ws.apache.org/axis/java/apiDocs/javax/xml/rpc/Call.html">Axis RPC Call, for specification of interface Call</a>
	 * @see <a href="http://ws.apache.org/axis/java/apiDocs/org/apache/axis/client/Call.html">Axis client Call class, for implementation of interface Call</a>
	*/
	protected String gFindObjects(String searchFieldedTerms, String sort,
			int hitPageStart, int hitPageSize, int snippetsMax,
			/*int fieldMaxLength,*/ String indexName, String resultPageXslt) throws Exception
	{
		// The same Call instance (port already chosen in the constructor) is
		// reused for every request; only the operation name is set here.
		call.setOperationName(G_FIND_OBJECTS);

		// Argument order of GSearch 2.2 for Fedora 3. (Fedora 2's GSearch
		// expected: query, sort, hitPageStart, hitPageSize, snippetsMax,
		// fieldMaxLength, indexName, resultPageXslt.)
		final Object[] arguments = {
			searchFieldedTerms, hitPageStart, hitPageSize, snippetsMax,
			FIELD_MAX_LENGTH, indexName, sort, resultPageXslt };

		return (String)call.invoke(arguments);
	}

	/**
	 * Method that performs a search for the given searchTerm inside the given
	 * indexed field.
	 * @param searchFieldName is the name of the indexed field within which the
	 * given searchTerm is to be searched for. If it is null or "", the
	 * searchTerm is sent without any field prefix.
	 * @param searchTerm is the term to be searched for.
	 * @param hitPageStart is passed on as the hitPageStart argument of
	 * gfindObjects.
	 * @param hitPageSize is passed on as the hitPageSize argument of
	 * gfindObjects.
	 * @param snippetsMax is the maximum number of separate snippets containing
	 * the searchTerm that are to be returned. (snippetsMax or a fewer number of
	 * occurrences of the word in the text will be returned)
	 * @return the XML (in string format) returned from Fedora Generic Search's
	 * gfindObjects method
	 * @throws Exception if invoking the web service fails
	*/
	public String search(String searchFieldName, String searchTerm,
			int hitPageStart, int hitPageSize, int snippetsMax) throws Exception
	{
		final String sort = ""; // returns results from highest to lowest rank
		final String resultPageXslt = "";

		// when a fieldname is given to search in (ds.fulltext, dc.title)
		// then prepend that followed by a COLON to the searchTerm.
		final boolean noField
			= (searchFieldName == null || searchFieldName.equals(""));
		final String fullSearchTerm = noField ?
				searchTerm : (searchFieldName+":"+searchTerm);

		return gFindObjects(fullSearchTerm, sort,
			hitPageStart, hitPageSize, snippetsMax,
			indexName, resultPageXslt);
	}

	/**
	 * FedoraGSearch accepts a query of the form:
	 * <code>&lt;"cyclone val" "Gender Inequalities" ds.fulltext:"cyclone val"
	 * ds.fulltext:"worst storm"&gt;</code>
	 * where the first two phrases carry no field name, while the last two
	 * are searched for in the ds.fulltext field.
	 * Another example:
	 * <code>&lt;gender dc.title:interview ds.fulltext:"cyclone val"&gt;</code>
	 * Here "gender" carries no field name, while the title index
	 * is searched for "interview" and fulltexts are searched for the phrase
	 * "cyclone val".
	 * @param fieldsToSearchTerms is a Map of (searchfield,
	 * associated-searchTerms) pairs. The keys dealt with here are the
	 * constants ALL_FIELDS, ALL_TITLES and FULLTEXT. The value for each is a
	 * comma-separated list of search terms (words or phrases) for that field.
	 * Internally the field names get converted to what FedoraGSearch's
	 * gfindObjects understands: every title term is prefixed with dc.title:,
	 * every fulltext term with ds.fulltext: and every all-fields term with
	 * foxml.all.text:
	 * @param hitPageStart is passed on as the hitPageStart argument of
	 * gfindObjects.
	 * @param hitPageSize is passed on as the hitPageSize argument of
	 * gfindObjects.
	 * @return the XML (in string format) returned from Fedora Generic Search's
	 * gfindObjects method, or "" if the map held nothing to search on
	 * @throws Exception if invoking the web service fails
	*/
	public String search(Map fieldsToSearchTerms,
			int hitPageStart, int hitPageSize)
		throws Exception
	{
		LOG.debug("In FedoraGS3's GSearchConnection.search(Map,...)");

		// Each value is a comma separated list of terms that may be either a
		// word OR a phrase. Every list is rewritten into space separated,
		// field-prefixed terms with phrases in quotes.
		String allfields = formatSearchTermsInField(
				(String)fieldsToSearchTerms.get(ALL_FIELDS), ALL_FIELDS);
		String titles = formatSearchTermsInField(
				(String)fieldsToSearchTerms.get(ALL_TITLES), DC_TITLE_FIELD);
		String fulltexts = formatSearchTermsInField(
				(String)fieldsToSearchTerms.get(FULLTEXT), FULLTEXT_FIELD);

		String fullSearchTerm = allfields + titles + fulltexts;
		if(fullSearchTerm.trim().equals("")) { // nothing to search on
			return "";
		}

		// Finally, add the clause for the Greenstone digital objects stored
		// in Fedora. Each formatted term ends in a space, so the clause can be
		// appended directly.
		//! Everything after the colon in the pid is ignored by FedoraGSearch:
		// "PID:\"greenstone:gs2mgdemo\"" ignores "gs2mgdemo"
		final String greenstonePID
			= PID + FedoraGS3DL.COLON + FedoraGS3DL.GREENSTONE;
		fullSearchTerm += greenstonePID;

		// <snippet> tags interfere when PID field is searched on, set it to 0
		return search(fullSearchTerm, hitPageStart, hitPageSize, 0);
	}

	/** Rewrites a comma separated list of search terms, where each term may be
	 * either a word OR a phrase, into a space separated list wherein phrases
	 * are in quotes and EVERY term is prefixed with the field to search it in.
	 * The prefix is repeated because, in Lucene's query syntax, a field name
	 * applies only to the term that directly follows it. Examples:
	 * <pre>a phrase,word  with dc.title    becomes  dc.title:"a phrase" dc.title:word
	 * cyclone val    with ds.fulltext becomes  ds.fulltext:"cyclone val"
	 * interview      with ALL_FIELDS  becomes  foxml.all.text:interview</pre>
	 * Whitespace around a term is removed, empty terms are skipped and terms
	 * that are already enclosed in double quotes are not quoted again.
	 * This is required to facilitate fielded searching with fedoraGSearch.
	 * @param field is a comma separated list of search terms (corresponding
	 * to one fieldName) to be reorganised. May be null.
	 * @param fieldName is the name of the field to prepend to each term.
	 * For fieldName ALL_FIELDS the prefix ALL_INDEXED_FIELDS is used instead.
	 * @return the reorganised terms, each one followed by a space, or "" if
	 * field is null or contains no terms.
	*/
	protected String formatSearchTermsInField(String field, String fieldName)
	{
		if(field == null) {
			return "";
		}

		// In older versions of GSearch (version 2.2), searching over all fields
		// meant not specifying an index to search in. From GSearch version
		// 2.4/2.5 need to search in field "foxml.all.text" to search all
		// indexed fields.
		final String prefix = (fieldName.equals(ALL_FIELDS)
				? ALL_INDEXED_FIELDS : fieldName) + ":";

		final StringBuilder formatted = new StringBuilder();
		for(String rawTerm : field.split(",")) {
			String term = rawTerm.trim();
			if(term.length() == 0) {
				continue; // e.g. "a,,b" or an empty field value
			}
			// if it contains a space, then the term's a phrase, put it in
			// quotes (unless the caller has quoted it already)
			boolean alreadyQuoted = term.length() >= 2
				&& term.startsWith("\"") && term.endsWith("\"");
			if(term.indexOf(SPACE) != -1 && !alreadyQuoted) {
				term = "\"" + term + "\"";
			}
			formatted.append(prefix).append(term).append(SPACE);
		}
		return formatted.toString();
	}

	/**
	 * Uses FedoraGSearch to perform a search where the query is embedded in
	 * fieldedSearchTerms, which not only provides the terms to search on, but
	 * also the fields to search the (various) given terms in.
	 * @param fieldedSearchTerms is the String specifying all the search terms
	 * with their fields (or no field if it should search for the terms in
	 * all fields). The terms with no associated search-fields should come first.
	 * Search terms may be in quotes.
	 * @param hitPageStart is passed on as the hitPageStart argument of
	 * gfindObjects.
	 * @param hitPageSize is passed on as the hitPageSize argument of
	 * gfindObjects.
	 * @param snippetsMax is the maximum number of separate snippets containing
	 * the searchTerm (snippetsMax number of occurrences of the word in the text)
	 * returned.
	 * @return the XML (in string format) returned from Fedora Generic Search's
	 * gfindObjects method
	 * @throws Exception if invoking the web service fails
	*/
	public String search(String fieldedSearchTerms,
			int hitPageStart, int hitPageSize, int snippetsMax) throws Exception
	{
		LOG.debug("In method search(String fieldedSearchTerms,...). "
				+ "Query is:\n" + fieldedSearchTerms);

		final String sort = ""; // returns results from highest to lowest rank
		final String resultPageXslt = "";
		return gFindObjects(fieldedSearchTerms, sort,
				hitPageStart, hitPageSize, snippetsMax,
				indexName, resultPageXslt);
	}

	/** Call this method with the return value of calling search().
	 * Search results are returned in GSearch's XML response format,
	 * containing information that includes the PIDs of the documents that
	 * matched the search. These PIDs are returned in the array.
	 * @param collectionName is the name of the collection to restrict the
	 * search results by: only PIDs that contain this String are returned.
	 * If it's "" (or null), then results from all collections are
	 * returned. Generally, don't want to pass "", because, theoretically,
	 * all indexed collections in the repository could be considered and
	 * not all of them may be Greenstone collections. If all Greenstone
	 * collections should be searched for, pass "greenstone" as the
	 * collection name instead.
	 * @param searchResult is the Fedora Generic Search XML response returned
	 * from performing a gfindObjects() operation. If it is "" (or null), an
	 * empty array is returned.
	 * @return an array of the pids of documents found for the search.
	 * @throws IllegalArgumentException if the response contains no
	 * gfindObjects element
	 * @throws NumberFormatException if the hitTotal attribute is not a number
	 * @throws Exception if the response cannot be parsed */
	public String[] getPIDsFromSearchResult(String collectionName,
			String searchResult)
		throws Exception
	{
		if(searchResult == null || searchResult.equals("")) {
			return new String[0];
		}

		// Shape of the response that is being read here:
		//	<?xml version="1.0" encoding="UTF-8"?>
		//	<resultPage xmlns:dc="http://purl.org/dc/elements/1.1/" ... indexName="FedoraIndex" dateTime="...">
		//	<gfindObjects hitTotal="1" resultPageXslt="" hitPageSize="10" hitPageStart="1" query="ds.fulltext:Cyclone">
		//	<objects>
		//	<object no="1" score="0.24639596">
		//	<field name="PID">greenstone:gs2mgdemo-HASH01d667303fe98545f03c14ae</field>
		//	<field name="repositoryName">Fedora</field>
		//	<field name="dc.title">some title</field>
		//	...
		//	<field name="ds.fulltext" snippet="yes">(The 1993 <span class="highlight">cyclone</span>, although</field>
		//	...
		//	</object>
		//	</objects>
		//	</gfindObjects>

		// 1. Get documentElement, which is <resultPage>
		Element resultPage = FedoraCommons.getResponseAsDOM(builder, searchResult);

		// 2. find the hitTotal value which is the number of results
		// it's an attribute of the sole compulsory <gfindObjects> element
		Element gfindObjectsEl
			= (Element)resultPage.getElementsByTagName(G_FIND_OBJECTS).item(0);
		if(gfindObjectsEl == null) {
			throw new IllegalArgumentException("Search result contains no <"
					+ G_FIND_OBJECTS + "> element");
		}
		int hitTotal = Integer.parseInt(
				gfindObjectsEl.getAttribute(HIT_TOTAL).trim());
		if(hitTotal == 0) {
			return new String[0];
		}

		// 3. collect the PID of every <object> in the response. The list is
		// sized by the number of <object>s present and not by hitTotal, since
		// hitTotal counts all hits and can be far larger than the number of
		// objects that were actually returned.
		NodeList objects = gfindObjectsEl.getElementsByTagName(OBJECT);
		List<String> pidsInCollection = new ArrayList<String>(objects.getLength());
		final boolean anyCollection
			= (collectionName == null || collectionName.equals(""));

		for(int i = 0; i < objects.getLength(); i++) {
			Element object = (Element)objects.item(i);
			NodeList fields = object.getElementsByTagName(FIELD);

			for(int j = 0; j < fields.getLength(); j++) {
				// find the sole <field> of <object> where NAME attribute == PID
				Element field = (Element)fields.item(j);
				if(!field.getAttribute(NAME).equals(PID)) {
					continue;
				}
				String pid = FedoraCommons.getValue(field);
				// Either store only the pids which are part of the collection,
				// or, if no collection is specified, then store the pid too
				if(anyCollection || pid.contains(collectionName)) {
					pidsInCollection.add(pid);
				}
				break; // found the pid field, consider next <object>
			}
		}
		return pidsInCollection.toArray(new String[pidsInCollection.size()]);
	}

	/** Prints the given heading to System.err, followed by each of the given
	 * pids on a line of its own to System.out. Used by main() only. */
	private static void printPids(String heading, String[] pids) {
		System.err.println(heading);
		for(int i = 0; i < pids.length; i++) {
			System.out.println(pids[i]);
		}
	}

	/** Manual test: runs a few searches against a FedoraGSearch installation
	 * on localhost and prints the responses and the PIDs found.
	 * @param args is ignored */
	public static void main(String[] args) {
		try {
			GSearchConnection searcher = new GSearchConnection(
				"http://localhost:8080/fedoragsearch/services/FgsOperations?wsdl", "FedoraIndex");

			// search(Map,...) is declared with a raw Map, so a raw Map is
			// handed over here too
			Map map = new HashMap();
			map.put(GSearchConnection.ALL_FIELDS, "gender inequalities");
			map.put(GSearchConnection.FULLTEXT, "cyclone val,worst storm");
			String searchResult = searcher.search(map, 1, 10);
			System.out.println(searchResult);

			String[] pids = searcher.getPIDsFromSearchResult("gs2mgdemo", searchResult);
			printPids("Found pids for search:\n", pids);

			String searchTerms="\"gender inequalities\" ds.fulltext:\"cyclone val\" ds.fulltext:\"worst storm\"";
			searchResult = searcher.search(searchTerms, 1, 10, 3);
			System.out.println(searchResult);

			// Not restricting results to any collection (search results from
			// all collections)
			pids = searcher.getPIDsFromSearchResult("", searchResult);
			printPids("Found pids for search: ", pids);

			searchResult = searcher.search("ds.fulltext", "cyclone", 1, 10, 3);
			System.out.println(searchResult);

			pids = searcher.getPIDsFromSearchResult("", searchResult);
			printPids("Found pids for search: ", pids);

		} catch(Exception e) {
			// Print the whole trace: getMessage() alone may be null and
			// does not tell where the failure happened.
			e.printStackTrace();
		}
	}

}