/* ------------------------------------------------------------------- */
/* htget     : Fetch a file using HTTP protocol                        */
/*                                                                     */
/* Author    : Ole Husby, BIBSYS                                       */
/* Updated   : 1998-09-30                                              */
/*                                                                     */
/* ------------------------------------------------------------------- */
/*                                                                     */
/* htget(url, type, timeout_seconds, outfile, content_type, location)  */
/*                                                                     */
/*   Returns: HTTP statuscode, or one of these private codes:          */
/*                 0 : OK ( = 200)                                     */
/*               900 : Error, possible timeout                         */
/*               901 : Syntax error in url                             */
/*               902 : Unknown host                                    */
/*               903 : No response from server (no connection)         */
/*               904 : File is not text/html                           */
/*               905 : Statusline > 255 bytes                          */
/*               906 : Statusline < 4 bytes                            */
/*               907 : Statusline not starting with "HTTP"             */
/*               908 : Statuscode not numeric                          */
/*               909 : Size of header > BUFSIZE                        */
/*               910 : Unable to open output file                      */
/*               999 : Unspecified TCP/IP error                        */
/*                                                                     */
/*   Writes to outfile, depending on type, if statuscode = 0 | 200 :   */
/*                                                                     */
/*   type = 0  : Nothing                                               */
/*   type = 1  : HTTP header                                           */
/*   type = 2  : HTTP header + entitybody                              */
/*   type = 3  : HTTP entitybody                                       */
/*   type = 4  : HTTP entitybody if text/html                          */
/*   type = 5  : HTTP <HEAD> part of entitybody if text/html           */
/*   type = 6  : HTTP <HEAD> part of entitybody if text/html           */
/*               HTTP entitybody if application/marc                   */
/*                                                                     */
/* ------------------------------------------------------------------- */


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <netdb.h>
#include <signal.h>

#define FALSE               0
#define TRUE                1
#define BUFSIZE             10000 /* bytes per body read(); also the limit for the header size */
/* Values of the `type` argument: what is written to the output file  */
#define TYPE_NONE           0     /* nothing                                        */
#define TYPE_HTTPHEAD       1     /* HTTP header                                    */
#define TYPE_HTTPALL        2     /* HTTP header + entitybody                       */
#define TYPE_HTTPBODY       3     /* HTTP entitybody                                */
#define TYPE_HTMLALL        4     /* HTTP entitybody if text/html                   */
#define TYPE_HTMLHEAD       5     /* <HEAD> part of entitybody if text/html         */
#define TYPE_MARC           6     /* as 5, or whole entitybody if application/marc  */

#define TRACE               0     /* nonzero: progress messages are printed with printf() */

#define AGENT               "BIBSYS_htget v1.1" /* client name placed in the request */

/* Content-Type value found by getHeader() in the response header      */
/* (emptied at the start of each getHeader() call). htget() copies it  */
/* to its h_contype argument.                                          */
char conType[128];
/* htmlonly  : set by getHeader() when the response is text/html and   */
/*             type is TYPE_HTMLHEAD or TYPE_MARC; getBody() then      */
/*             cuts the body at the end of the HTML head.              */
/* marcrecord: set by getHeader() when the response is                 */
/*             application/marc and type is TYPE_MARC; it is not read  */
/*             anywhere in this section.                               */
/* Both are reset to 0 at the start of htget().                        */
int htmlonly, marcrecord;



/* ------------------------------------------------------------------- */
/* thandler: SIGALRM handler installed by htget()                      */
/*                                                                     */
/* Intentionally does nothing with the signal it receives.             */
/* ------------------------------------------------------------------- */

void thandler(int i)
{
   (void) i;                          /* the signal number is not needed */
}



/* ------------------------------------------------------------------- */
/* geteoHEAD: Cut buf at the end of the HTML head                      */
/*                                                                     */
/* Searches buf with cstr() for "<BODY" first and, only if that is not */
/* found, for "</HEAD>". On a hit the text is ended with "\n" + NUL    */
/* where "<BODY" started, or directly behind "</HEAD>".                */
/*                                                                     */
/* buf needs room for one byte beyond its terminating NUL, because a   */
/* "</HEAD>" at the very end of the text has the NUL written there.    */
/*                                                                     */
/* NOTE(review): no declaration of cstr() is visible here; presumably  */
/* a substring search returning a pointer to the match - confirm.      */
/*                                                                     */
/* returns: 0 if neither marker was found                              */
/*          1 if buf was cut                                           */
/* ------------------------------------------------------------------- */

int geteoHEAD(char *buf)
{
  char *cut;

  cut = (char *) cstr(buf, "<BODY");
  if (!cut)
  {
    cut = (char *) cstr(buf, "</HEAD>");
    if (!cut)
      return 0;

    cut += 7;                          /* step over the "</HEAD>" tag */
  }

  cut[0] = '\n';
  cut[1] = '\0';

  return 1;
}



/* ------------------------------------------------------------------- */
/* writeRequest: send request to server                                */
/*                                                                     */
/* Keeps calling write() until the whole NUL-terminated string req has */
/* been sent on the descriptor server, so a short write does not       */
/* silently truncate the request.                                      */
/*                                                                     */
/* returns: Number of bytes written (strlen(req) on success)           */
/*          -1 if write() reports an error                             */
/* ------------------------------------------------------------------- */

int writeRequest(char *req, int server)
{
   size_t  total = strlen(req);
   size_t  sent = 0;
   ssize_t n;

   if (TRACE)
      printf("*** send(): %s\n", req);

   while (sent < total)
   {
      n = write(server, req + sent, total - sent);
      if (n < 0)
         return -1;
      if (n == 0)
         break;                 /* no progress: report the short count */
      sent += (size_t) n;
   }

   return (int) sent;
}



/* ------------------------------------------------------------------- */
/* getBody : Read the Entity-body and write it to the open file fd     */
/*                                                                     */
/* Reads from the socket until the server closes it or a chunk ends    */
/* with '\0' (that '\0' is not written). When htmlonly is set, each    */
/* chunk is passed to geteoHEAD() and reading stops once the end of    */
/* the HTML head was found. filename is not used.                      */
/*                                                                     */
/* Each read() is guarded by alarm(timeout); an interrupted read()     */
/* returns a negative value and is reported as 900.                    */
/*                                                                     */
/* Returns   0 if ok                                                   */
/*         900 if read() or write() fails                              */
/* ------------------------------------------------------------------- */

int getBody(int server, int timeout, int fd, char *filename)
{
   /* One byte for the NUL stored below plus one more: geteoHEAD() may */
   /* put "\n" + NUL behind a "</HEAD>" that ends a full BUFSIZE read. */
   char    buf[BUFSIZE + 2];
   ssize_t bytecount, done, n;
   int     ef;
   int     found_end = FALSE;

   (void) filename;                  /* the file is already open as fd */

   if (TRACE)
     printf("*** Read entitybody\n");

/* Loop until the endmark is found                                     */

   while (!found_end)
   {
      alarm (timeout);
      bytecount = read (server, buf, BUFSIZE);
      alarm (0);

      if (bytecount < 0)
        return 900;                                 /* error in read() */

      if (bytecount == 0)
        break;                                 /* server closed socket */

      if (buf[bytecount-1] == '\0')
      {
         bytecount--;                 /* do not write the '\0' to file */
         found_end = TRUE;                       /* terminate the loop */
      }

      if (bytecount == 0)
        continue;                        /* chunk held only the endmark */

      buf[bytecount] = '\0';

      ef = 0;
      if ( htmlonly )
      {
         ef = geteoHEAD(buf);            /* may shorten the text in buf */
         bytecount = (ssize_t) strlen(buf);
      }

/*    Write to file, repeating after short writes                      */

      for (done = 0; done < bytecount; done += n)
      {
         n = write (fd, buf + done, (size_t) (bytecount - done));
         if (n <= 0)
           return 900;                             /* error in write() */
      }

      if (ef)
        break;                            /* end of HTML head was found */
   }

   return 0;
}
      


/* ------------------------------------------------------------------- */
/* getHeader : Read HTTP statusline and response header from socket    */
/*                                                                     */
/* Writes both to fd when type is TYPE_HTTPHEAD or TYPE_HTTPALL,       */
/* stores the Content-Type value in conType and a Location value in    */
/* loc (no size is passed: loc must hold up to BUFSIZE bytes), and     */
/* sets htmlonly / marcrecord. reason is not used.                     */
/*                                                                     */
/* Returns   0 if ok                                                   */
/*         900 if read() or write() fails (possible timeout)           */
/*         904 if status is 200 but Content-type is not acceptable     */
/*         905 if Statusline > 255 bytes                               */
/*         906 if Statusline < 4 bytes                                 */
/*         907 if Statusline not starting with "HTTP"                  */
/*         908 if Statuscode not numeric                               */
/*         909 if size of header > BUFSIZE                             */
/*         HTTP statuscode if <> 200 (takes precedence over 904)       */
/*                                                                     */
/* ------------------------------------------------------------------- */

/* Read one byte from the socket under the alarm() timeout.            */
/* Returns the read() result: 1 byte stored, 0 at EOF, < 0 on error.   */
static int htReadByte(int server, int timeout, char *c)
{
   int rc;

   alarm (timeout);
   rc = (int) read (server, c, 1);
   alarm (0);

   return rc;
}

/* Write len bytes to fd, repeating after short writes.                */
/* Returns 0 if everything was written, -1 otherwise.                  */
static int htWriteAll(int fd, const char *data, size_t len)
{
   ssize_t n;

   while (len > 0)
   {
      n = write (fd, data, len);
      if (n <= 0)
         return -1;
      data += n;
      len  -= (size_t) n;
   }

   return 0;
}

int getHeader(int server, int timeout, int type, int fd, char *reason, char *loc)
{
   char buf[BUFSIZE+1];
   char c, *p, *eol, *val;
   int rc, statuscode, rlen, len, bol, echo;
   size_t n, i;

   (void) reason;           /* never reported back to the caller */
   *conType = 0;
   echo = ( type == TYPE_HTTPHEAD ) || ( type == TYPE_HTTPALL );

   if (TRACE)
     printf("*** Read statusline\n");


/* Read HTTP statusline (until LF, 0 or EOF; CR is dropped; the LF    */
/* counts towards the limit of 255 bytes)                             */

   rlen = len = 0;
   for (;;)
   {
      rc = htReadByte(server, timeout, &c);

      if ( rc < 0 )
         return 900;
      if ( rc == 0 )
         break;               /* EOF: parse whatever has been received */
      if ( c == '\r' )
         continue;

      rlen++;

      if (TRACE)
        printf("%c", c);

      if ( c == 0 || rlen > 255 )
         return 905;
      if ( c == '\n' )
         break;

      buf[len++] = c;
   }
   buf[len] = 0;

/*  Write statusline                                                 */

   if ( echo )
   {
     if (htWriteAll(fd, buf, strlen(buf)) < 0 || htWriteAll(fd, "\n\n", 2) < 0)
       return 900;
   }

   if (TRACE)
      printf("*** read() : (%d) %s\n", rlen, buf);


/* Parse statusline: "HTTP/x.y" <spaces> code <spaces> reason          */

   if (rlen < 4)
     return 906;

   if (strncmp(buf, "HTTP", 4) != 0)
     return 907;

   p  = buf + strcspn(buf, " ");           /* end of the version field */
   p += strspn(p, " ");                    /* start of the status code */
   n  = strcspn(p, " ");                   /* length of the status code */

   if (n == 0)
     statuscode = 200;             /* no code on the line: treat as OK */
   else
   {
     if (n > 9)
       return 908;                       /* would not fit in an int */
     for (i = 0; i < n; i++)
       if (!isdigit((unsigned char) p[i]))
         return 908;
     statuscode = atoi(p);
   }

   if (statuscode == 200)
     statuscode = 0;

   if (!type)
     return statuscode;


/* Read HTTP response header (until 0, EOF or empty line, max BUFSIZE  */
/* bytes). bol starts TRUE so that a response without any header line  */
/* ends at its first LF instead of swallowing the body.                */

   rlen = len = 0;
   bol = TRUE;

   if (TRACE)
     printf("*** Read responseheader\n");

   for (;;)
   {
      rc = htReadByte(server, timeout, &c);

      if ( rc < 0 )
         return 900;
      if ( rc == 0 )
         break;                 /* EOF: use the lines received so far */
      if ( c == '\r' )
         continue;

      rlen++;

      if ( c == 0 || rlen > BUFSIZE )
         return 909;

      if ( c == '\n' )
      {
         if (bol)
           break;                        /* empty line: end of header */
         bol = TRUE;
      }
      else
         bol = FALSE;

      buf[len++] = c;
   }
   buf[len] = 0;

/*  Write rest of HTTP header                                         */

   if ( echo )
   {
     if (htWriteAll(fd, buf, strlen(buf)) < 0 || htWriteAll(fd, "\n", 1) < 0)
       return 900;
   }

   if (TRACE)
      printf("*** read() : (%d) %s\n", rlen, buf);


/* Parse header for Content-Type and Location, one line at a time     */

   rc = 904;

   for (p = buf; *p; p = eol)
   {
     eol = strchr(p, '\n');
     if (eol)
       *eol++ = 0;
     else
       eol = p + strlen(p);

     if (strncasecmp(p, "Content-Type:", 13) == 0)
     {
       val = p + 13;
       while (*val == ' ' || *val == '\t')
         val++;

       /* bounded copy: a header line may be longer than conType */
       snprintf(conType, sizeof conType, "%s", val);

       if (strncasecmp(val, "text/html", 9) == 0)
       {
         if ( ( type == TYPE_HTMLHEAD) || (type == TYPE_MARC ) )
           htmlonly = 1;
         rc = 0;
       }

       else if (strncasecmp(val, "application/marc", 16) == 0)
       {
         if ( type == TYPE_MARC )
         {
           marcrecord = 1;
           rc = 0;
         }
       }
     }

     else if (strncasecmp(p, "Location:", 9) == 0)
     {
       val = p + 9;
       while (*val == ' ' || *val == '\t')
         val++;
       strcpy(loc, val);      /* size of loc is unknown here, see above */
     }
   }


/* Socket is positioned at start of HTTP Entity-Body. A status other  */
/* than 200 is reported as such; the Content-Type verdict only        */
/* matters for a 200 response.                                        */

   if (statuscode)
     return statuscode;

   return rc;
}





/* ------------------------------------------------------------------- */
/* htget  : Fetch a URL                                                */
/* ------------------------------------------------------------------- */

int htget(char *iurl, int type, int timeout, char *outfile, char *h_contype, char *h_location)
{
   int i, rc, fd, soc, port;
   struct sockaddr_in addr; 
   struct hostent *hp, *gethostbyname();
   char	uurl[1024], *url = uurl, hostname[256], cport[64], req[1024];
   char *h, *p, *q, *r;
   char tfile[256], blank[2];

   *blank = *h_contype = *h_location = 0; 
   htmlonly = marcrecord = 0;

   strcpy(tfile, "/tmp/geturl.tmp");
   if (!*outfile)
     outfile = (char *) tfile;

   if (!*iurl || ( strlen(iurl) > 1023 ) )
     return 901;

   strcpy(url, iurl);

/* Parse and validate URL                                             */

   if (strncmp(url, "http://", 7) != 0)
     return 901;

   url += 7;

   q = strtok(url, "/");
   if (!q)
     return 901;

   r = strtok(NULL, "\0");
   if (!r)
     r = (char *) blank;

   h = strtok(q, ":");
   if (!*h)
     return 901;

   strcpy(hostname, h);

   p = strtok(NULL, "\0");

   if (!p || !*p)
     port = 80;
   else
   {
     for (i = 0; i < strlen(p); i++)
       if (!isdigit(p[i]))
         return 901;
     port = atoi(p);
   }
   sprintf(req, "GET /%s HTTP/1.0\r\nUser_Agent: %s\n\n", r, AGENT);
  

/* Establish handler for the alarm-signal                              */

   signal (SIGALRM, thandler);


/* Get IP address                                                      */
 
   hp = gethostbyname(hostname);
   if (!hp)
     return 902;

/* Get socket and connect                                              */
 
   soc = socket(AF_INET, SOCK_STREAM, 0);
 
   addr.sin_family = AF_INET;
   
   memcpy( &addr.sin_addr.s_addr, hp->h_addr, (size_t) hp->h_length );

   addr.sin_port = htons(port);
   if (connect(soc, (struct sockaddr *) &addr, sizeof(struct sockaddr_in)) < 0)
      return 903;


/* Open the destination file                                           */
 
   if (type)
   {
     fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC , 0666);
     if (fd < 0)  
     {
        close(soc);
        return 910;
     }
   }




/* Write HTTP-request                                               */
 
   if (!writeRequest(req, soc))
   {
     close(soc);
     if (type)
       close(fd);
     return 999;
   }


/* Read header part of response                                     */

   rc = getHeader(soc, timeout, type, fd, r, h_location);
   
   if ( ( rc == 904 ) && (type < 4 ) )
     rc = 0;

   if ( ( rc == 0 ) && ( type > 1 ) )
     rc = getBody(soc, timeout, fd, outfile);
 
   close(soc);
   if (type)
     close(fd);
   strcpy(h_contype, conType);

   return rc;
}
