"Fossies" - the Fresh Open Source Software Archive

Member "citadel/modules/rssclient/serv_rssclient.c" (5 Jun 2021, 11799 Bytes) of package /linux/www/citadel.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "serv_rssclient.c", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 9.01_vs_902.

    1 /*
    2  * Bring external RSS and/or Atom feeds into rooms.  This module implements a
    3  * very loose parser that scrapes both kinds of feeds and is not picky about
    4  * the standards compliance of the source data.
    5  *
    6  * Copyright (c) 2007-2021 by the citadel.org team
    7  *
    8  * This program is open source software; you can redistribute it and/or
    9  * modify it under the terms of the GNU General Public License version 3.
   10  *
   11  * This program is distributed in the hope that it will be useful,
   12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14  * GNU General Public License for more details.
   15  */
   16 
   17 #include <stdlib.h>
   18 #include <unistd.h>
   19 #include <stdio.h>
   20 #include <time.h>
   21 #include <ctype.h>
   22 #include <string.h>
   23 #include <errno.h>
   24 #include <sys/types.h>
   25 #include <sys/stat.h>
   26 #include <expat.h>
   27 #include <curl/curl.h>
   28 #include <libcitadel.h>
   29 #include "citadel.h"
   30 #include "server.h"
   31 #include "citserver.h"
   32 #include "support.h"
   33 #include "config.h"
   34 #include "threads.h"
   35 #include "ctdl_module.h"
   36 #include "msgbase.h"
   37 #include "parsedate.h"
   38 #include "database.h"
   39 #include "citadel_dirs.h"
   40 #include "context.h"
   41 #include "internet_addressing.h"
   42 
   43 struct rssroom {
   44     struct rssroom *next;
   45     char *room;
   46 };
   47 
   48 struct rssurl {
   49     struct rssurl *next;
   50     char *url;
   51     struct rssroom *rooms;
   52 };
   53 
   54 struct rssparser {
   55     StrBuf *CData;
   56     struct CtdlMessage *msg;
   57     char *link;
   58     char *description;
   59     char *item_id;
   60     struct rssroom *rooms;
   61 };
   62 
   63 time_t last_run = 0L;
   64 struct rssurl *rsstodo = NULL;
   65 
   66 
   67 // This handler is called whenever an XML tag opens.
   68 //
   69 void rss_start_element(void *data, const char *el, const char **attribute) {
   70     struct rssparser *r = (struct rssparser *)data;
   71     int i;
   72 
   73     if (server_shutting_down) return;           // shunt the whole operation if we're exiting
   74 
   75     if (
   76         (!strcasecmp(el, "entry"))
   77         || (!strcasecmp(el, "item"))
   78     ) {
   79         // this is the start of a new item(rss) or entry(atom)
   80         if (r->msg != NULL) {
   81             CM_Free(r->msg);
   82             r->msg = NULL;
   83         }
   84         r->msg = malloc(sizeof(struct CtdlMessage));
   85         memset(r->msg, 0, sizeof(struct CtdlMessage));
   86         r->msg->cm_magic = CTDLMESSAGE_MAGIC;
   87         r->msg->cm_anon_type = MES_NORMAL;
   88         r->msg->cm_format_type = FMT_RFC822;
   89     }
   90 
   91     else if (!strcasecmp(el, "link")) {         // atom feeds have the link as an attribute
   92         for(i = 0; attribute[i]; i += 2) {
   93             if (!strcasecmp(attribute[i], "href")) {
   94                 if (r->link != NULL) {
   95                     free(r->link);
   96                     r->link = NULL;
   97                 }
   98                 r->link = strdup(attribute[i+1]);
   99                 striplt(r->link);
  100             }
  101         }
  102     }
  103 }
  104 
  105 
  106 // This handler is called whenever an XML tag closes.
  107 //
  108 void rss_end_element(void *data, const char *el) {
  109     struct rssparser *r = (struct rssparser *)data;
  110     StrBuf *encoded_field;
  111 
  112     if (server_shutting_down) return;           // shunt the whole operation if we're exiting
  113 
  114     if (StrLength(r->CData) > 0) {              // strip leading/trailing whitespace from field
  115         StrBufTrim(r->CData);
  116     }
  117 
  118     if (                            // end of a new item(rss) or entry(atom)
  119         (!strcasecmp(el, "entry"))
  120         || (!strcasecmp(el, "item"))
  121     ) {
  122         if (r->msg != NULL) {               // Save the message to the rooms
  123 
  124             // use the link as an item id if nothing else is available
  125             if ((r->item_id == NULL) && (r->link != NULL)) {
  126                 r->item_id = strdup(r->link);
  127             }
  128 
  129             // check the use table
  130             StrBuf *u = NewStrBuf();
  131             StrBufAppendPrintf(u, "rss/%s", r->item_id);
  132             int already_seen = CheckIfAlreadySeen(u);
  133             FreeStrBuf(&u);
  134 
  135             if (already_seen == 0) {
  136 
  137                 // Compose the message text
  138                 StrBuf *TheMessage = NewStrBuf();
  139                 StrBufAppendPrintf(TheMessage,
  140                     "Content-type: text/html\n\n"
  141                     "\n\n"
  142                     "<html><head></head><body>"
  143                 );
  144         
  145                 if (r->description != NULL) {
  146                     StrBufAppendPrintf(TheMessage, "%s<br><br>\r\n", r->description);
  147                     free(r->description);
  148                     r->description = NULL;
  149                 }
  150         
  151                 if (r->link != NULL) {
  152                     StrBufAppendPrintf(TheMessage, "<a href=\"%s\">%s</a>\r\n", r->link, r->link);
  153                     free(r->link);
  154                     r->link = NULL;
  155                 }
  156     
  157                 StrBufAppendPrintf(TheMessage, "</body></html>\r\n");
  158                 CM_SetField(r->msg, eMesageText, ChrPtr(TheMessage), StrLength(TheMessage));
  159                 FreeStrBuf(&TheMessage);
  160     
  161                 if (CM_IsEmpty(r->msg, eAuthor)) {
  162                     CM_SetField(r->msg, eAuthor, HKEY("rss"));
  163                 }
  164     
  165                 if (CM_IsEmpty(r->msg, eTimestamp)) {
  166                     CM_SetFieldLONG(r->msg, eTimestamp, time(NULL));
  167                 }
  168     
  169                 // Save it to the room(s)
  170                 struct rssroom *rr = NULL;
  171                 long msgnum = (-1);
  172                 for (rr=r->rooms; rr!=NULL; rr=rr->next) {
  173                     if (rr == r->rooms) {
  174                         msgnum = CtdlSubmitMsg(r->msg, NULL, rr->room);     // in first room, save msg
  175                     }
  176                     else {
  177                         CtdlSaveMsgPointerInRoom(rr->room, msgnum, 0, NULL);    // elsewhere, save a pointer
  178                     }
  179                     syslog(LOG_DEBUG, "rssclient: saved message %ld to %s", msgnum, rr->room);
  180                 }
  181             }
  182             else {
  183                 syslog(LOG_DEBUG, "rssclient: already seen %s", r->item_id);
  184             }
  185     
  186             CM_Free(r->msg);
  187             r->msg = NULL;
  188         }
  189 
  190         if (r->item_id != NULL) {
  191             free(r->item_id);
  192             r->item_id = NULL;
  193         }
  194     }
  195 
  196     else if (!strcasecmp(el, "title")) {            // item subject (rss and atom)
  197         if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eMsgSubject))) {
  198             encoded_field = NewStrBuf();
  199             StrBufRFC2047encode(&encoded_field, r->CData);
  200             CM_SetAsFieldSB(r->msg, eMsgSubject, &encoded_field);
  201         }
  202     }
  203 
  204     else if (!strcasecmp(el, "creator")) {          // <creator> can be used if <author> is not present
  205         if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eAuthor))) {
  206             encoded_field = NewStrBuf();
  207             StrBufRFC2047encode(&encoded_field, r->CData);
  208             CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field);
  209         }
  210     }
  211 
  212     else if (!strcasecmp(el, "author")) {           // <author> supercedes <creator> if both are present
  213         if (r->msg != NULL) {
  214             encoded_field = NewStrBuf();
  215             StrBufRFC2047encode(&encoded_field, r->CData);
  216             CM_SetAsFieldSB(r->msg, eAuthor, &encoded_field);
  217         }
  218     }
  219 
  220     else if (!strcasecmp(el, "pubdate")) {          // date/time stamp (rss) Sat, 25 Feb 2017 14:28:01 EST
  221         if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
  222             CM_SetFieldLONG(r->msg, eTimestamp, parsedate(ChrPtr(r->CData)));
  223         }
  224     }
  225 
  226     else if (!strcasecmp(el, "updated")) {          // date/time stamp (atom) 2003-12-13T18:30:02Z
  227         if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
  228             struct tm t;
  229             char zulu;
  230             memset(&t, 0, sizeof t);
  231             sscanf(ChrPtr(r->CData), "%d-%d-%dT%d:%d:%d%c", &t.tm_year, &t.tm_mon, &t.tm_mday, &t.tm_hour, &t.tm_min, &t.tm_sec, &zulu);
  232             t.tm_year -= 1900;
  233             t.tm_mon -= 1;
  234             CM_SetFieldLONG(r->msg, eTimestamp, mktime(&t));
  235         }
  236     }
  237 
  238     else if (!strcasecmp(el, "link")) {         // link to story (rss)
  239         if (r->link != NULL) {
  240             free(r->link);
  241             r->link = NULL;
  242         }
  243         r->link = strdup(ChrPtr(r->CData));
  244     }
  245 
  246     else if (
  247         (!strcasecmp(el, "guid"))           // unique item id (rss)
  248         || (!strcasecmp(el, "id"))          // unique item id (atom)
  249     ) {
  250         if (r->item_id != NULL) {
  251             free(r->item_id);
  252             r->item_id = NULL;
  253         }
  254         r->item_id = strdup(ChrPtr(r->CData));
  255     }
  256 
  257     else if (
  258         (!strcasecmp(el, "description"))        // message text (rss)
  259         || (!strcasecmp(el, "summary"))         // message text (atom)
  260         || (!strcasecmp(el, "content"))         // message text (atom)
  261     ) {
  262         if (r->description != NULL) {
  263             free(r->description);
  264             r->description = NULL;
  265         }
  266         r->description = strdup(ChrPtr(r->CData));
  267     }
  268 
  269     if (r->CData != NULL) {
  270         FreeStrBuf(&r->CData);
  271         r->CData = NULL;
  272     }
  273 }
  274 
  275 
  276 // This handler is called whenever data appears between opening and closing tags.
  277 //
  278 void rss_handle_data(void *data, const char *content, int length)
  279 {
  280     struct rssparser *r = (struct rssparser *)data;
  281 
  282     if (r->CData == NULL) {
  283         r->CData = NewStrBuf();
  284     }
  285 
  286     StrBufAppendBufPlain(r->CData, content, length, 0);
  287 }
  288 
  289 
  290 // Feed has been downloaded, now parse it.
  291 //
  292 void rss_parse_feed(StrBuf *Feed, struct rssroom *rooms)
  293 {
  294     struct rssparser r;
  295 
  296     memset(&r, 0, sizeof r);
  297     r.rooms = rooms;
  298     XML_Parser p = XML_ParserCreate("UTF-8");
  299     XML_SetElementHandler(p, rss_start_element, rss_end_element);
  300     XML_SetCharacterDataHandler(p, rss_handle_data);
  301     XML_SetUserData(p, (void *)&r);
  302     XML_Parse(p, ChrPtr(Feed), StrLength(Feed), XML_TRUE);
  303     XML_ParserFree(p);
  304 }
  305 
  306 
  307 // Add a feed/room pair into the todo list
  308 //
  309 void rssclient_push_todo(char *rssurl, char *roomname)
  310 {
  311     struct rssurl *r = NULL;
  312     struct rssurl *thisone = NULL;
  313     struct rssroom *newroom = NULL;
  314 
  315     syslog(LOG_DEBUG, "rssclient: will fetch %s to %s", rssurl, roomname);
  316 
  317     for (r=rsstodo; r!=NULL; r=r->next) {
  318         if (!strcasecmp(r->url, rssurl)) {
  319             thisone = r;
  320         }
  321     }
  322 
  323     if (thisone == NULL) {
  324         thisone = malloc(sizeof(struct rssurl));
  325         thisone->url = strdup(rssurl);
  326         thisone->rooms = NULL;
  327         thisone->next = rsstodo;
  328         rsstodo = thisone;
  329     }
  330 
  331     newroom = malloc(sizeof(struct rssroom));
  332     newroom->room = strdup(roomname);
  333     newroom->next = thisone->rooms;
  334     thisone->rooms = newroom;
  335 }
  336 
  337 
  338 // pull one feed (possibly multiple rooms)
  339 //
  340 void rss_pull_one_feed(struct rssurl *url)
  341 {
  342     CURL *curl;
  343     CURLcode res;
  344     StrBuf *Downloaded = NULL;
  345 
  346     syslog(LOG_DEBUG, "rssclient: fetching %s", url->url);
  347 
  348     curl = curl_easy_init();
  349     if (!curl) {
  350         return;
  351     }
  352 
  353     Downloaded = NewStrBuf();
  354 
  355     curl_easy_setopt(curl, CURLOPT_URL, url->url);
  356     curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
  357     curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
  358     curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);         // Follow redirects
  359     curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, CurlFillStrBuf_callback); // What to do with downloaded data
  360     curl_easy_setopt(curl, CURLOPT_WRITEDATA, Downloaded);          // Give it our StrBuf to work with
  361     curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);               // Time out after 20 seconds
  362     res = curl_easy_perform(curl);                      // Perform the request
  363     if (res != CURLE_OK) {
  364         syslog(LOG_WARNING, "rssclient: failed to load feed: %s", curl_easy_strerror(res));
  365     }
  366     curl_easy_cleanup(curl);
  367 
  368     rss_parse_feed(Downloaded, url->rooms);                 // parse the feed
  369     FreeStrBuf(&Downloaded);                        // free the downloaded feed data
  370 }
  371 
  372 
  373 // We have a list, now download the feeds
  374 //
  375 void rss_pull_feeds(void)
  376 {
  377     struct rssurl *r;
  378     struct rssroom *rr;
  379 
  380     while ((rsstodo != NULL) && (!server_shutting_down)) {
  381         rss_pull_one_feed(rsstodo);
  382         r = rsstodo;
  383         rsstodo = rsstodo->next;
  384         while (r->rooms != NULL) {
  385             rr = r->rooms;
  386             r->rooms = r->rooms->next;
  387             free(rr->room);
  388             free(rr);
  389         }
  390         free(r->url);
  391         free(r);
  392     }
  393 }
  394 
  395 
  396 // Scan a room's netconfig looking for RSS feed parsing requests
  397 //
  398 void rssclient_scan_room(struct ctdlroom *qrbuf, void *data)
  399 {
  400     char *serialized_config = NULL;
  401     int num_configs = 0;
  402     char cfgline[SIZ];
  403     int i = 0;
  404 
  405     if (server_shutting_down) return;
  406 
  407         serialized_config = LoadRoomNetConfigFile(qrbuf->QRnumber);
  408         if (!serialized_config) {
  409         return;
  410     }
  411 
  412     num_configs = num_tokens(serialized_config, '\n');
  413     for (i=0; i<num_configs; ++i) {
  414         extract_token(cfgline, serialized_config, i, '\n', sizeof cfgline);
  415         if (!strncasecmp(cfgline, HKEY("rssclient|"))) {
  416             strcpy(cfgline, &cfgline[10]);
  417             char *vbar = strchr(cfgline, '|');
  418             if (vbar != NULL) {
  419                 *vbar = 0;
  420             }
  421             rssclient_push_todo(cfgline, qrbuf->QRname);
  422         }
  423     }
  424 
  425     free(serialized_config);
  426 }
  427 
  428 
  429 /*
  430  * Scan for rooms that have RSS client requests configured
  431  */
  432 void rssclient_scan(void) {
  433     time_t now = time(NULL);
  434 
  435     /* Run no more than once every 15 minutes. */
  436     if ((now - last_run) < 900) {
  437         syslog(LOG_DEBUG,
  438             "rssclient: polling interval not yet reached; last run was %ldm%lds ago",
  439             ((now - last_run) / 60),
  440             ((now - last_run) % 60)
  441         );
  442         return;
  443     }
  444 
  445     syslog(LOG_DEBUG, "rssclient: started");
  446     CtdlForEachRoom(rssclient_scan_room, NULL);
  447     rss_pull_feeds();
  448     syslog(LOG_DEBUG, "rssclient: ended");
  449     last_run = time(NULL);
  450     return;
  451 }
  452 
  453 
  454 CTDL_MODULE_INIT(rssclient)
  455 {
  456     if (!threading)
  457     {
  458         syslog(LOG_INFO, "rssclient: using %s", curl_version());
  459         CtdlRegisterSessionHook(rssclient_scan, EVT_TIMER, PRIO_AGGR + 300);
  460     }
  461     return "rssclient";
  462 }