libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

html_extractor.c File Reference

plugin to support HTML files More...

#include "platform.h"
#include "extractor.h"
#include <magic.h>
#include <tidy/tidy.h>
#include <tidy/tidybuffio.h>
Include dependency graph for html_extractor.c:

Go to the source code of this file.

Functions

static enum EXTRACTOR_MetaType tag_to_type (const char *tag)
 
static Bool TIDY_CALL report_cb (TidyDoc doc, TidyReportLevel lvl, uint line, uint col, ctmbstr mssg)
 
static int TIDY_CALL get_byte_cb (void *sourceData)
 
static void TIDY_CALL unget_byte_cb (void *sourceData, byte bt)
 
static Bool TIDY_CALL eof_cb (void *sourceData)
 
void EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
 
void html_gobject_init ()
 
void html_ltdl_fini ()
 

Variables

struct {
   const char *   name
 
   enum EXTRACTOR_MetaType   type
 
} tagmap []
 
static magic_t magic
 

Detailed Description

plugin to support HTML files

Author
Christian Grothoff

Definition in file html_extractor.c.

Function Documentation

◆ eof_cb()

static Bool TIDY_CALL eof_cb ( void *  sourceData)
static

Input callback: check for EOF.

Parameters
sourceData: our 'struct EXTRACTOR_ExtractContext'
Returns
true if we are at the EOF

Definition at line 161 of file html_extractor.c.

References EXTRACTOR_ExtractContext::cls, EXTRACTOR_ExtractContext::get_size, and EXTRACTOR_ExtractContext::seek.

Referenced by EXTRACTOR_html_extract_method().

◆ EXTRACTOR_html_extract_method()

void EXTRACTOR_html_extract_method ( struct EXTRACTOR_ExtractContext *  ec)

◆ get_byte_cb()

static int TIDY_CALL get_byte_cb ( void *  sourceData)
static

Input callback: get next byte of input.

Parameters
sourceData: our 'struct EXTRACTOR_ExtractContext'
Returns
next byte of input, EndOfStream on errors and EOF

Definition at line 126 of file html_extractor.c.

References EXTRACTOR_ExtractContext::cls, and EXTRACTOR_ExtractContext::read.

Referenced by EXTRACTOR_html_extract_method().

◆ html_gobject_init()

void html_gobject_init ( )

Initialize glib and load magic file.

Definition at line 678 of file html_extractor.c.

References magic, and NULL.

◆ html_ltdl_fini()

void html_ltdl_fini ( )

Destructor for the library, cleans up.

Definition at line 692 of file html_extractor.c.

References magic, and NULL.

◆ report_cb()

static Bool TIDY_CALL report_cb ( TidyDoc  doc,
TidyReportLevel  lvl,
uint  line,
uint  col,
ctmbstr  mssg 
)
static

Function called by libtidy for error reporting.

Parameters
doc: tidy doc being processed
lvl: report level
line: input line
col: input column
mssg: message
Returns
FALSE (no output)

Definition at line 109 of file html_extractor.c.

Referenced by EXTRACTOR_html_extract_method().

◆ tag_to_type()

static enum EXTRACTOR_MetaType tag_to_type ( const char *  tag)
static

Map 'meta' tag to LE type.

Parameters
tag: tag to map
Returns
EXTRACTOR_METATYPE_RESERVED if the type was not found

Definition at line 76 of file html_extractor.c.

Referenced by EXTRACTOR_html_extract_method().

◆ unget_byte_cb()

static void TIDY_CALL unget_byte_cb ( void *  sourceData,
byte  bt 
)
static

Input callback: unget last byte of input.

Parameters
sourceData: our 'struct EXTRACTOR_ExtractContext'
bt: byte to unget (ignored)

Definition at line 146 of file html_extractor.c.

References EXTRACTOR_ExtractContext::cls, and EXTRACTOR_ExtractContext::seek.

Referenced by EXTRACTOR_html_extract_method().

Variable Documentation

◆ magic

magic_t magic
static

Global handle to MAGIC data.

Definition at line 76 of file html_extractor.c.

Referenced by EXTRACTOR_html_extract_method(), html_gobject_init(), and html_ltdl_fini().

◆ name

const char* name

◆ tagmap

struct { ... } tagmap[]
Initial value:
= {
{ "dc.title", EXTRACTOR_METATYPE_TITLE},
{ "description", EXTRACTOR_METATYPE_DESCRIPTION },
{ "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
{ "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
{ "publisher", EXTRACTOR_METATYPE_PUBLISHER },
{ "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
{ "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
{ "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
{ "language", EXTRACTOR_METATYPE_LANGUAGE },
{ "keywords", EXTRACTOR_METATYPE_KEYWORDS },
{ "abstract", EXTRACTOR_METATYPE_ABSTRACT },
{ "dc.creator", EXTRACTOR_METATYPE_CREATOR},
{ "dc.identifier", EXTRACTOR_METATYPE_URI },
{ "dc.format", EXTRACTOR_METATYPE_FORMAT },
}
#define NULL
Definition: getopt1.c:60
@ EXTRACTOR_METATYPE_UNKNOWN_DATE
Definition: extractor.h:195
@ EXTRACTOR_METATYPE_URI
Definition: extractor.h:162
@ EXTRACTOR_METATYPE_FORMAT
Definition: extractor.h:190
@ EXTRACTOR_METATYPE_AUTHOR_NAME
Definition: extractor.h:143
@ EXTRACTOR_METATYPE_LANGUAGE
Definition: extractor.h:157
@ EXTRACTOR_METATYPE_TITLE
Definition: extractor.h:134
@ EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE
Definition: extractor.h:194
@ EXTRACTOR_METATYPE_CREATOR
Definition: extractor.h:189
@ EXTRACTOR_METATYPE_ABSTRACT
Definition: extractor.h:186
@ EXTRACTOR_METATYPE_KEYWORDS
Definition: extractor.h:185
@ EXTRACTOR_METATYPE_COPYRIGHT
Definition: extractor.h:183
@ EXTRACTOR_METATYPE_RIGHTS
Definition: extractor.h:184
@ EXTRACTOR_METATYPE_PUBLISHER
Definition: extractor.h:146
@ EXTRACTOR_METATYPE_SUBJECT
Definition: extractor.h:188
@ EXTRACTOR_METATYPE_RESERVED
Definition: extractor.h:128
@ EXTRACTOR_METATYPE_DESCRIPTION
Definition: extractor.h:182

Mapping of HTML META names to LE types.

◆ type