"Fossies" - the Fresh Open Source Software archive 
Member "netrik-1.16.1/url.c" of archive netrik-1.16.1.tar.gz:
/*
netrik -- The ANTRIK Internet Viewer
Copyright (C) Olaf D. Buddenhagen AKA antrik, et al (see AUTHORS)
Published under the GNU GPL; see LICENSE for details.
*/
/*
* url.c -- URL handling functions
*
* (C) 2001, 2002 antrik
* 2002 Patrice Neff
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "debug.h"
#include "url.h"
static void dump_url(const struct Url *url); /* print components of a split URL */
static void store_component(const char *start, const char *end, char **component); /* store a URL field extracted by split_url() */
static void str_append(char **str, const char *append); /* append string to end of other string, reallocating memory */
/* substitute a printable marker for unset (NULL) strings in debug output */
static const char *dump_str(const char *str)
{
	return str!=NULL ? str : "(null)";
}

/*
 * Print components of a split URL, one field per line, using debug_printf().
 *
 * String fields may legitimately be NULL (split_url() leaves every field it
 * did not find unset); handing NULL to a "%s" conversion is undefined
 * behaviour, so such fields are printed as "(null)" explicitly.
 */
static void dump_url(const struct Url *url)
{
	debug_printf(" full url: %s\n", dump_str(url->full_url));
	debug_printf(" full path: %s\n", dump_str(url->path));
	debug_printf(" protocol: %s\n", dump_str(url->proto.str));
	debug_printf(" host: %s\n", dump_str(url->host));
	debug_printf(" port: %d\n", url->port);
	debug_printf(" directory: %s\n", dump_str(url->dir));
	debug_printf(" file name: %s\n", dump_str(url->name));
	debug_printf(" parameters: %s\n", dump_str(url->params));
	debug_printf(" fragment identifier: %s\n", dump_str(url->frag));
}
/*
 * Store a section representing one URL field extracted by split_url() to the
 * respective string.
 *
 * start, end: the field consists of the chars from "start" up to (but not
 *             including) "end"
 * component:  where to store; receives a newly allocated, NUL-terminated copy
 *             of the field, or NULL if the field is empty ("end" not behind
 *             "start")
 *
 * Terminates the program with exit(1) if the memory allocation fails.
 */
static void store_component(const char *start, const char *end, char **component)
{
	size_t len;
	char *copy;

	if(end<=start) {    /* empty field */
		*component=NULL;
		return;
	}

	len=(size_t)(end-start);
	copy=malloc(len+1);
	if(copy==NULL) {    /* check the allocation itself, not the address of the destination pointer */
		fprintf(stderr, "memory allocation error while parsing URL (in function store_component)\n");
		exit(1);
	}

	memcpy(copy, start, len);
	copy[len]='\0';
	*component=copy;
}
/*
 * parse URL (split URL string into fields)
 *
 * The URL is first copied to a local buffer, replacing every char that is not
 * a printable, non-blank ASCII char (and also "'") by a "%hh" escape code. The
 * escaped copy is then scanned char by char by a state machine: each step
 * dispatches on the combination of the current char (all chars other than ':',
 * '/', '?', '#' and the terminating '\0' are represented by '$') and the
 * current parsing mode. A field is stored (using store_component()) as soon as
 * its end is encountered.
 *
 * Returns a newly allocated struct Url. Fields not found in the URL keep the
 * default from the initializer below; "proto.str" is an empty string and
 * "proto.type" PT_UNKNOWN if the URL has no protocol specification. "port" is
 * only assigned when a host was parsed (80 if no port number was given).
 * "full_url" and "path" are not assigned here beyond the defaults.
 *
 * On an unknown protocol or an unexpected character, an error message is
 * printed, "proto.type" is set to PT_INTERNAL, and the struct is returned
 * immediately with the fields parsed so far.
 *
 * Terminates the program with exit(1) if a memory allocation fails.
 */
struct Url *split_url(url)
const char *url;
{
struct Url *components;
char *url_char; /* current position in URL string */
char *word_start; /* starting position of currently parsed url field */
char *name_start; /* start of file name (component of path after last '/') */
/* mode to parse next char in (<<8 to allow simple combining with "dispatch_char") */
enum {
PM_START=0x000,
PM_SLASH=0x100, /* '/' at beginning of URL */
PM_PROTO=0x200, /* inside protocol specification */
PM_HOST_START1=0x300, /* first '/' of "//" introducing host */
PM_HOST_START2=0x400, /* expecting second '/' of "//" */
PM_HOST=0x500, /* inside host name */
PM_PORT=0x600, /* inside port number */
PM_PATH=0x700, /* inside directory/name string */
PM_PARA=0x800, /* inside CGI parameter list */
PM_FRAG=0x900, /* inside fragment identifier */
PM_END=0xa00 /* '\0' ending URL string */
} parse_mode=PM_START;
char escaped_url[3*strlen(url)+1]; /* enough for worst case of all characters needing escaping */
DMSG(("parsing URL...\n"));
components=malloc(sizeof(struct Url));
if(components==NULL) {
fprintf(stderr, "memory allocation error while parsing URL\n");
exit(1);
}
/* set default values */
{
const struct Url def={NULL, NULL, {NULL, PT_UNKNOWN}, NULL, 0, NULL, NULL, NULL, NULL, 0, 0};
memcpy(components, &def, sizeof(def));
components->proto.str=strdup(""); /* must be present when setting PT_INTERNAL due to error */
if(components->proto.str==NULL) {
fprintf(stderr, "memory allocation error while splitting URL\n");
exit(1);
}
}
/* Escape invalid chars in URL. */
{
char *src_pos;
char *write_pos;
for(src_pos=(char *)url, write_pos=escaped_url; *src_pos; ++src_pos) { /* all chars in original URL */
const unsigned char chr=*src_pos;
if(isascii(chr) && isgraph(chr) && chr!='\'') /* normal char (see hacking-load.* for explanation!) */
write_pos+=snprintf(write_pos, sizeof(escaped_url) - (write_pos-escaped_url), "%c", chr); /* just store char */
else /* needs to be escaped */
write_pos+=snprintf(write_pos, sizeof(escaped_url) - (write_pos-escaped_url), "%%%.2x", (int)chr); /* store %hh */
}
*write_pos=0;
}
/* the loop runs up to and including the terminating '\0', which is what finally switches to PM_END */
for(name_start=word_start=url_char=escaped_url; parse_mode!=PM_END; ++url_char) { /* all chars in URL string */
int dispatch_char; /* input character after modification for dispatch */
int dispatch; /* combined state variable used to determine action in main dispatcher */
int recycle=0; /* need additional dispatch pass (sometimes necessary after switching parsing mode) */
DMSG(("%c", *url_char));
if(strchr(":/?#", *url_char)==NULL) /* normal character */
dispatch_char='$'; /* indicated by '$' (shared dispatch case) */
else /* characters which may have special function (including '\0', as strchr() also matches the string terminator) */
dispatch_char=*url_char; /* handled each separately in dispatcher */
do { /* while recycle (additional pass necessary) */
recycle=0;
dispatch=dispatch_char|parse_mode; /* dispatch on new char and current state */
switch(dispatch) {
/* URL starts with normal char -> assume it is a protocol specification */
case '$'|PM_START:
parse_mode=PM_PROTO;
/* fall through */
/* normal protocol string char -> go ahead */
case '$'|PM_PROTO:
break;
/* URL starts with '/' */
case '/'|PM_START:
name_start=url_char+1; /* might be path start... (see '/'|PM_PATH below) */
parse_mode=PM_SLASH;
break;
/* URL starts with host ("//") */
case '/'|PM_SLASH:
parse_mode=PM_HOST_START2; /* host will follow */
recycle=1;
break;
/* URL starts with path (with single '/') -> skip to path parsing */
case '$'|PM_SLASH:
case '?'|PM_SLASH:
case '#'|PM_SLASH:
case '\0'|PM_SLASH:
/* URL starts neither with '/' nor with word -> also skip to path parsing (which will skip to next component) */
case '?'|PM_START:
case '#'|PM_START:
case '\0'|PM_START:
/* URL starts with word, but first word doesn't end with ':' => it was not the protocol specification -> also skip to path parsing */
case '/'|PM_PROTO:
case '?'|PM_PROTO:
case '#'|PM_PROTO:
case '\0'|PM_PROTO:
parse_mode=PM_PATH;
recycle=1;
break;
/* protocol specification end */
case ':'|PM_PROTO:
free(components->proto.str);
store_component(word_start, url_char, &components->proto.str);
/* test for known protocols */
if(components->proto.str!=NULL) {
if(strcasecmp(components->proto.str, "http")==0)
components->proto.type=PT_HTTP;
else if(strcasecmp(components->proto.str, "ftp")==0)
components->proto.type=PT_FTP;
else if(strcasecmp(components->proto.str, "file")==0)
components->proto.type=PT_FILE;
else {
fprintf(stderr, "unknown protocol %s\n", components->proto.str);
components->proto.type=PT_INTERNAL;
return components;
}
} else
components->proto.type=PT_UNKNOWN;
parse_mode=PM_HOST_START1;
break;
/* first '/' of "//" introducing host specification */
case '/'|PM_HOST_START1:
parse_mode=PM_HOST_START2;
break;
/* second '/' */
case '/'|PM_HOST_START2:
parse_mode=PM_HOST;
word_start=url_char+1; /* host name starts after the "//" */
break;
/* host name char */
case '$'|PM_HOST:
break;
/* host name end */
case ':'|PM_HOST:
case '/'|PM_HOST:
case '?'|PM_HOST:
case '#'|PM_HOST:
case '\0'|PM_HOST:
store_component(word_start, url_char, &components->host);
parse_mode=PM_PORT;
if(*url_char==':') /* port number follows host name */
word_start=url_char+1;
else { /* something else than port number follows -> needs to take another look at the current char */
word_start=url_char;
recycle=1;
}
break;
/* port number char */
case '$'|PM_PORT:
break;
/* port number end */
case '/'|PM_PORT:
case '?'|PM_PORT:
case '#'|PM_PORT:
case '\0'|PM_PORT:
components->port=atoi(word_start);
if(components->port==0) /* no port specified -> use default port */
components->port=80;
name_start=word_start=url_char;
parse_mode=PM_PATH;
recycle=1; /* path start always needs additional pass (for setting "name_start" properly) */
break;
/* normal path char */
case '$'|PM_PATH:
case ':'|PM_PATH:
break;
/* '/' in path -> previous word wasn't file name, but the following one might be */
case '/'|PM_PATH:
name_start=url_char+1; /* file name starts after last '/' */
break;
/* path end */
case '?'|PM_PATH:
case '#'|PM_PATH:
case '\0'|PM_PATH:
if((url_char-name_start==1 && !strncmp(name_start, ".", 1)) || (url_char-name_start==2 && !strncmp(name_start, "..", 2))) { /* path ends in "." or ".." => this is *not* the file name, but a dir component! */
char save_char=*url_char;
*url_char='/'; /* let it end with a slash like a dir should... */
store_component(word_start, url_char+1, &components->dir); /* whole path, and the appended '/', are directory */
*url_char=save_char; /* need to restore the original char at this position, as it is important for further processing */
} else {
store_component(word_start, name_start, &components->dir); /* path ends at last '/' */
store_component(name_start, url_char, &components->name); /* file name is path component after last '/' */
}
word_start=url_char+1;
parse_mode=PM_PARA;
if(*url_char!='?') /* no parameter list follows -> let the parameter state handle the current char */
recycle=1;
break;
/* parameter list char */
case '$'|PM_PARA:
case ':'|PM_PARA:
case '/'|PM_PARA: /* '/' and '?' treated as normal chars in parameter list */
case '?'|PM_PARA:
break;
/* parameter list end */
case '#'|PM_PARA:
case '\0'|PM_PARA:
store_component(word_start, url_char, &components->params);
word_start=url_char+1;
parse_mode=PM_FRAG;
if(*url_char!='#') /* no fragment identifier follows -> let the fragment state handle the current char */
recycle=1;
break;
/* fragment identifier char */
case '$'|PM_FRAG:
break;
/* fragment identifier end == url end */
case '\0'|PM_FRAG:
store_component(word_start, url_char, &components->frag);
parse_mode=PM_END;
break;
/* any char/mode combination not listed above (e.g. ':' inside the port number or the fragment identifier) */
default:
fprintf(stderr, "\nURL parse error (unexpected character)\n");
components->proto.type=PT_INTERNAL;
return components;
} /* switch(dispatch) */
} while(recycle); /* additional dispatch pass after parsing mode change */
} /* for all chars in URL string */
#ifdef DEBUG
if(cfg.debug) {
debug_printf("\nextracted URL components:\n");
dump_url(components);
}
#endif
return components;
}
/*
 * Free memory used by a parsed URL struct: all strings owned by the struct,
 * and then the struct itself.
 *
 * "path" is not freed on its own; merge_urls() sets it to point into
 * "full_url".
 *
 * Passing NULL is allowed and frees nothing (like free(NULL)).
 */
void free_url(struct Url *url)
{
	DMSG(("freeing url components...\n"));

	if(url==NULL)
		return;

	/* free(NULL) is a no-op, so unset fields need no special treatment */
	free(url->full_url);
	free(url->proto.str);
	free(url->host);
	free(url->dir);
	free(url->name);
	free(url->params);
	free(url->frag);

	free(url);
}
/*
 * Append string to end of other string, reallocating memory; create new
 * string, if "*str" was NULL.
 *
 * str:    address of the string to extend; "*str" must be NULL or point to
 *         memory obtained from malloc() and friends, and is updated to the
 *         (possibly moved) result
 * append: string to attach
 *
 * Terminates the program with exit(2) if the memory allocation fails.
 */
static void str_append(char **str, const char *append)
{
	const size_t old_len=(*str!=NULL) ? strlen(*str) : 0;
	const size_t add_len=strlen(append);
	char *grown;

	/* realloc() on a NULL pointer acts like malloc(), so no separate creation step is needed */
	grown=realloc(*str, old_len+add_len+1);
	if(grown==NULL) {
		fprintf(stderr, "memory allocation error while creating URL (in function str_append)\n");
		exit(2);
	}

	memcpy(grown+old_len, append, add_len+1);    /* old end is known -> no rescan; copy includes the terminating '\0' */
	*str=grown;
}
/*
* Merges two URLs to create a new one.
*
* The "url_string" is split into components, and "base_url" is used to
* complete the URL: Components are taken from "base_url" (or defaults, if no
* "base_url" supplied) until the first field is specified in the main URL;
* afterwards, all components are taken from the main URL (or defaults, if none
* supplied). If the main URL path is relative, it is merged with the base
* path.
*
* The "form" parameter gives an optional form data string which is to be
* submitted with the URL. (Method GET.) NULL means nothing to submit.
* Otherwise, the form data is attached as part of the merged URL.
*/
struct Url *merge_urls(base, url_string, form)
const struct Url *base; /* default values/base path */
const char *url_string; /* URL to be split and completed by "base_url" */
const char *form;
{
struct Url *url; /* merged URL */
struct Url *base_url;
struct Url *main_url;
base_url=(struct Url *)base;
#ifdef DEBUG
if(cfg.debug) {
if(base_url!=NULL) {
debug_printf("Base URL components:\n");
dump_url(base_url);
} else
debug_printf("No base URL.\n");
}
#endif
main_url=split_url(url_string);
if(main_url->proto.type==PT_INTERNAL) /* splitting failed */
fprintf(stderr, "can't get absolute target URL\n");
DMSG(("Merging...\n"));
/* alloc struct for merged URL */
url=malloc(sizeof(struct Url));
if(url==NULL) {
fprintf(stderr, "memory allocation error while parsing URL\n");
exit(1);
}
/* merge with fallback values from "base_url" */
if(main_url->proto.type!=PT_UNKNOWN) { /* some protocol set in "main_url" -> take it */
base_url=NULL; /* take following components from "main_url" also (use defaults if not supplied) */
url->proto.type=main_url->proto.type;
url->proto.str=strdup(main_url->proto.str);
} else { /* no protocol in "main_url" */
if(base_url!=NULL) { /* has "base_url" -> take fallback from it */
url->proto.type=base_url->proto.type;
url->proto.str=strdup(base_url->proto.str);
} else { /* no "base_url" -> use default */
url->proto.type=PT_UNKNOWN;
url->proto.str=strdup("");
}
}
if(base_url==NULL) /* absolute URL (no "base_url" was given, or "main_url" contained a protocol specification, so "base_url" isn't used) */
url->absolute=1;
else
url->absolute=0;
if(main_url->host!=NULL) {
base_url=NULL;
url->host=strdup(main_url->host);
url->port=main_url->port; /* port always set with host */
} else {
if(base_url!=NULL) { /* has "base_url" (and no components from "main_url" yet) */
url->host=strdup(base_url->host);
url->port=base_url->port;
} else {
url->host=strdup("");
url->port=80;
}
}
if(main_url->dir!=NULL) {
if(main_url->dir[0]=='/' || base_url==NULL) /* dir in "main_url" is absolute (or only one supplied) -> take it */
url->dir=strdup(main_url->dir);
else { /* dir in "main_url" is relative -> merge with "base_url" */
char merged_dir[strlen(base_url->dir)+strlen(main_url->dir)+1];
char *merge_part=main_url->dir;
strcpy(merged_dir, base_url->dir);
if(!strncmp(merge_part, "./", 2)) /* skip "./" */
merge_part+=2;
/* handle ".." */
for(; *merged_dir && !strncmp(merge_part, "../", 3); merge_part+=3) { /* relative path starts with "..", and there is anything left in original path -> remove last component */
char *trunc_pos;
/* find beginning of last component */
for(trunc_pos=strchr(merged_dir, 0)-1; trunc_pos>merged_dir && *(trunc_pos-1)!='/'; --trunc_pos); /* look for '/' ending previous component, or string start (skip last character, which is the '/' terminating last component) */
*trunc_pos=0;
} /* for all ".." */
strcat(merged_dir, merge_part); /* concatenate the part of the relative path remaining after skipping all the "./" and "../" */
url->dir=strdup(merged_dir);
}
base_url=NULL;
} else {
if(base_url!=NULL)
url->dir=strdup(base_url->dir);
else
url->dir=strdup(url->proto.type==PT_FILE || url->proto.type==PT_UNKNOWN?"":"/");
}
if(main_url->name!=NULL) {
base_url=NULL;
url->name=strdup(main_url->name);
} else {
if(base_url!=NULL)
url->name=strdup(base_url->name);
else
url->name=strdup("");
}
if(form!=NULL) { /* submit form data -> store in place of any other CGI paramters */
base_url=NULL; /* handle as if the data was passed as part of main URL (force loading new document even if URL otherwise identical; discard old fragment identifier) */
url->params=strdup(form);
} else if(main_url->params!=NULL) { /* no form to submit, but paramters given in main URL -> take them */
base_url=NULL;
url->params=strdup(main_url->params);
} else {
if(base_url!=NULL)
url->params=strdup(base_url->params);
else
url->params=strdup("");
}
if(base_url!=NULL) /* same document as "base", only fragment identifier may differ (nothing taken from "main_url" yet) */
url->local=1;
else
url->local=0;
if(main_url->frag!=NULL) {
base_url=NULL;
url->frag=strdup(main_url->frag);
} else {
if(base_url!=NULL)
url->frag=strdup(base_url->frag);
else
url->frag=strdup("");
}
if(url->proto.str==NULL
|| url->host==NULL
|| url->dir==NULL
|| url->name==NULL
|| url->params==NULL
|| url->frag==NULL
) {
fprintf(stderr, "memory allocation error while creating URL (in function merge_urls)\n");
exit(1);
}
free_url(main_url);
/* create "full_url" string (and set "path" pointer) */
{
int path_pos;
DMSG(("creating new URL...\n"));
url->full_url=NULL;
if(url->proto.type==PT_HTTP || url->proto.type==PT_FTP) {
str_append(&url->full_url, url->proto.str);
str_append(&url->full_url, "://");
}
str_append(&url->full_url, url->host);
if(url->port!=80) {
char port_str[7];
snprintf(port_str, sizeof(port_str), ":%d", url->port);
str_append(&url->full_url, port_str);
}
path_pos=strlen(url->full_url); /* path starts where "dir" and following components will be appended (present string end) */
str_append(&url->full_url, url->dir);
str_append(&url->full_url, url->name);
if(*url->params) {
str_append(&url->full_url, "?");
str_append(&url->full_url, url->params);
}
url->path=url->full_url+path_pos; /* store pointer to full path (can store pointer now, as full_url won't be moved anymore) */
}
#ifdef DEBUG
if(cfg.debug) {
debug_printf("merged URL components:\n");
dump_url(url);
}
#endif
return url;
}
/*
 * Returns a string with %xx URL escape codes from the source string replaced
 * by the respective characters. The result is newly allocated (to be released
 * with free() by the caller); the source remains untouched.
 *
 * Only a '%' followed by two hex digits is an escape code; any other '%'
 * (including one too close to the end of the string) is copied literally. An
 * escape code decoding to '\0' ("%00") ends the result string at that point.
 *
 * Terminates the program with exit(1) if the memory allocation fails.
 */
char *url_unescape(const char *escaped)
{
	const char *rpos=escaped;
	char *result;
	char *wpos;
	char chr;

	result=malloc(strlen(escaped)+1);    /* unescaping never makes the string longer */
	if(result==NULL) {
		fprintf(stderr, "memory allocation error while unescaping path (in function url_unescape)\n");
		exit(1);
	}

	wpos=result;
	do {
		/* rpos[2] is only examined if rpos[1] is a hex digit (thus not the string end), so this never reads behind the terminator */
		if(rpos[0]=='%' && isxdigit((unsigned char)rpos[1]) && isxdigit((unsigned char)rpos[2])) {
			const char hex[3]={rpos[1], rpos[2], '\0'};

			chr=(char)(unsigned char)strtoul(hex, NULL, 16);
			rpos+=3;
		} else
			chr=*rpos++;
	} while((*wpos++=chr)!='\0');    /* terminating '\0' is copied too */

	return result;
}