"Fossies" - the Fresh Open Source Software Archive

Member "atop-2.8.1/gpucom.c" (7 Jan 2023, 13110 Bytes) of package /linux/misc/atop-2.8.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "gpucom.c", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2.7.1_vs_2.8.0.

    1 /*
    2 ** ATOP - System & Process Monitor
    3 **
    4 ** The program 'atop' offers the possibility to view the activity of
    5 ** the system on system-level as well as process-level.
    6 **
    7 ** This source-file contains functions to interface with the atopgpud
    8 ** daemon that maintains statistics about the processor and memory
    9 ** utilization of the GPUs.
   10 ** ================================================================
   11 ** Author:      Gerlof Langeveld
   12 ** E-mail:      gerlof.langeveld@atoptool.nl
   13 ** Initial:     April/August 2018
   14 **
   15 ** This program is free software; you can redistribute it and/or modify it
   16 ** under the terms of the GNU General Public License as published by the
   17 ** Free Software Foundation; either version 2, or (at your option) any
   18 ** later version.
   19 **
   20 ** This program is distributed in the hope that it will be useful, but
   21 ** WITHOUT ANY WARRANTY; without even the implied warranty of
   22 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   23 ** See the GNU General Public License for more details.
   24 */
   25 
   26 #include <sys/types.h>
   27 #include <sys/socket.h>
   28 #include <netinet/in.h>
   29 #include <string.h>
   30 #include <stdio.h>
   31 #include <unistd.h>
   32 #include <stdlib.h>
   33 #include <malloc.h>
   34 
   35 #include "atop.h"
   36 #include "photosyst.h"
   37 #include "photoproc.h"
   38 #include "gpucom.h"
   39 
/*
** Delimiters used while parsing the response strings of atopgpud
*/
#define DUMMY       ' '     // initial "no delimiter seen yet" state of the stats parser
#define GPUDELIM    '@'     // introduces a series of counters on GPU level
#define PIDDELIM    '#'     // introduces a series of counters on process level

#define GPUDPORT    59123   // TCP port on the loopback address to connect to

/*
** Prototypes of the file-local helper functions
*/
static void gputype_parse(char *);

static void gpustat_parse(int, char *, int,
                              struct pergpu *, struct gpupidstat *);
static void gpuparse(int, char *, struct pergpu *);
static void pidparse(int, char *, struct gpupidstat *);
static int  rcvuntil(int, char *, int);

/*
** Socket connected to atopgpud; -1 as long as no connection is
** open (also reset to -1 whenever the connection is closed)
*/
static int  actsock = -1;

/*
** GPU type information, filled while parsing the 'T' response
*/
static int  numgpus;    // number of GPUs
static char **gpubusid; // array with char* to busid strings
static char **gputypes; // array with char* to type strings
static char *gputasks;  // array with chars with tasksupport booleans
   60 
   61 /*
   62 ** Open TCP connection to port of atopgpud and
   63 ** obtain type information of every GPU.
   64 **
   65 ** Return value:
   66 **  number of GPUs
   67 */
   68 int
   69 gpud_init(void)
   70 {
   71     struct sockaddr_in  name;
   72     socklen_t       namelen = sizeof name;
   73     char            typereq[] = {'T', APIVERSION};
   74     uint32_t        prelude;
   75     char            *buf;
   76     int         version, length;
   77 
   78     struct timeval      rcvtimeout = {2, 0};    // 2 seconds
   79 
   80     /*
   81     ** get local socket
   82     */
   83     if ( (actsock = socket(AF_INET, SOCK_STREAM, 0)) == -1)
   84     {
   85         perror("socket creation");
   86         return 0;
   87     }
   88 
   89     /*
   90     ** connect to server port
   91     */
   92     memset(&name, 0, sizeof name);
   93     name.sin_family      = AF_INET;
   94     name.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
   95     name.sin_port        = htons(GPUDPORT);
   96 
   97     if (connect(actsock, (struct sockaddr *)&name, namelen) == -1)
   98         goto close_and_return;
   99 
  100     /*
  101     ** set receive timeout, not to block atop forever
  102     ** in case something fails in the commmunication
  103     */
  104     (void) setsockopt(actsock, SOL_SOCKET, SO_RCVTIMEO,
  105                     &rcvtimeout, sizeof rcvtimeout);
  106 
  107     /*
  108     ** send request: GPU types
  109     */
  110     if ( write(actsock, typereq, sizeof typereq) < sizeof typereq)
  111     {
  112         perror("send type request to atopgpud");
  113         goto close_and_return;
  114     }
  115 
  116     /*
  117     ** receive response: GPU types
  118     */
  119     if (rcvuntil(actsock, (char *)&prelude, sizeof prelude) == -1)
  120     {
  121         perror("receive prelude from atopgpud");
  122         goto close_and_return;
  123     }
  124 
  125     prelude = ntohl(prelude);   // big endian to native endianess
  126 
  127     version = (prelude >> 24) & 0xff;
  128     length  =  prelude        & 0xffffff;
  129 
  130     if (version != APIVERSION)
  131     {
  132         fprintf(stderr,
  133             "wrong API version from atopgpud: %d %d\n",
  134             version, APIVERSION);
  135 
  136         goto close_and_return;
  137     }
  138  
  139     if (length > 8192)  // sanity check
  140     {
  141         fprintf(stderr,
  142             "unexpected response length atopgpud: %d\n", length);
  143 
  144         goto close_and_return;
  145     }
  146 
  147     buf = malloc(length+1);
  148     ptrverify(buf, "Malloc failed for gpu rcvbuf\n");
  149 
  150     if ( rcvuntil(actsock, buf, length) == -1)
  151     {
  152         perror("receive type request from atopgpud");
  153         goto close_and_return;
  154     }
  155 
  156     buf[length] = '\0';
  157 
  158     gputype_parse(buf);
  159 
  160         numgpus = numgpus <= MAXGPU ? numgpus : MAXGPU;
  161 
  162     return numgpus;
  163 
  164     close_and_return:
  165     close(actsock);
  166     actsock = -1;
  167     return 0;
  168 }
  169 
  170 
  171 /*
  172 ** Transmit status request for all GPUs.
  173 **
  174 ** Calling parameters:
  175 **  void
  176 **
  177 ** Return value:
  178 **  0 in case of failure
  179 **  1 in case of success
  180 */
  181 int
  182 gpud_statrequest(void)
  183 {
  184     char    statreq[] = {'S', APIVERSION};
  185 
  186     if (actsock == -1)
  187         return 0;
  188 
  189     if ( write(actsock, statreq, sizeof statreq) < sizeof statreq)
  190     {
  191         close(actsock);
  192         actsock = -1;
  193         return 0;
  194     }
  195 
  196     return 1;
  197 }
  198 
  199 
  200 /*
  201 ** Receive status response for all GPUs.
  202 **
  203 ** Calling parameters:
  204 **  *ggs    pointer to allocated array of pergpu structs
  205 **  **gps   pointer to pointer in which addresses to gpupidstat structs
  206 **      are returned
  207 **      can be NULL pointer is caller is not interested in proc stats
  208 **
  209 ** Return value:
  210 **  number of gpupidstat addresses (i.e. per-process info)
  211 **  -1 in case of failure
  212 */
  213 int
  214 gpud_statresponse(int maxgpu, struct pergpu *ggs, struct gpupidstat **gps)
  215 {
  216     uint32_t    prelude;
  217     char        *buf = NULL, *p;
  218     int     version, length;
  219     int     pids = 0;
  220 
  221     if (actsock == -1)
  222         return -1;
  223 
  224     /*
  225     ** receive 4-bytes introducer:
  226     **  first byte:     API version
  227     **  next three bytes:   length of string that follows
  228     */
  229     if ( rcvuntil(actsock, (char *)&prelude, sizeof prelude) == -1)
  230     {
  231         perror("receive 4-byte prelude from atopgpud");
  232         goto close_and_return;
  233     }
  234 
  235     prelude = ntohl(prelude);   // big endian to native endianess
  236 
  237     version = (prelude >> 24) & 0xff;
  238     length  =  prelude        & 0xffffff;
  239 
  240     if (version != APIVERSION)
  241     {
  242         fprintf(stderr,
  243             "wrong API version from atopgpud: %d %d\n",
  244             version, APIVERSION);
  245 
  246         goto close_and_return;
  247     }
  248  
  249     if (length > 8192)  // sanity check
  250     {
  251         fprintf(stderr,
  252             "unexpected response length atopgpud: %d\n", length);
  253 
  254         goto close_and_return;
  255     }
  256 
  257     buf = malloc(length+1);
  258     ptrverify(buf, "Malloc failed for gpu rcvbuf\n");
  259 
  260     /*
  261     ** receive statistics string
  262     */
  263     if ( rcvuntil(actsock, buf, length) == -1)
  264     {
  265         perror("receive stats string from atopgpud");
  266         goto close_and_return;
  267     }
  268 
  269     *(buf+length) = '\0';
  270 
  271     /*
  272     ** determine number of per-process stats
  273     ** and malloc space to parse these stats
  274     */
  275     for (p=buf; *p; p++)
  276     {
  277         if (*p == PIDDELIM)
  278             pids++;
  279     }
  280 
  281     if (gps)
  282     {
  283         if (pids)
  284         {
  285             *gps = malloc(pids * sizeof(struct gpupidstat));
  286             ptrverify(gps, "Malloc failed for gpu pidstats\n");
  287             memset(*gps, 0, pids * sizeof(struct gpupidstat));
  288         }
  289         else
  290         {
  291             *gps = NULL;
  292         }
  293     }
  294 
  295     /*
  296     ** parse stats string for per-gpu stats
  297     */
  298     gpustat_parse(version, buf, maxgpu, ggs, gps ? *gps : NULL);
  299 
  300     free(buf);
  301 
  302     return pids;
  303 
  304     close_and_return:
  305     if (buf)
  306         free(buf);
  307 
  308     close(actsock);
  309     actsock = -1;
  310     return -1;
  311 }
  312 
  313 
  314 /*
  315 ** Receive given number of bytes from given socket
  316 ** into given buffer address
  317 */
  318 static int
  319 rcvuntil(int sock, char *buf, int size)
  320 {
  321     int remain = size, n;
  322 
  323     while (remain)
  324     {
  325         n = read(sock, buf, remain);
  326 
  327         if (n <= 0)
  328             return -1;
  329 
  330         buf     += n;
  331         remain  -= n;
  332     }
  333 
  334     return size;
  335 }
  336 
  337 /*
  338 ** Parse response string from server on 'T' request
  339 **
  340 ** Store the type, busid and tasksupport of every GPU in
  341 ** static pointer tables
  342 */
  343 static void
  344 gputype_parse(char *buf)
  345 {
  346     char    *p, *start, **bp, **tp, *cp;
  347 
  348     /*
  349     ** determine number of GPUs
  350     */
  351     if ( sscanf(buf, "%d@", &numgpus) != 1)
  352     {
  353         close(actsock);
  354         actsock = -1;
  355         return;
  356     }
  357 
  358     for (p=buf; *p; p++)    // search for first delimiter
  359     {
  360         if (*p == GPUDELIM)
  361         {
  362             p++;
  363             break;
  364         }
  365     }
  366 
  367     /*
  368     ** parse GPU info and build arrays of pointers to the
  369     ** busid strings, type strings and tasksupport strings.
  370     */
  371     if (numgpus)            // GPUs present anyhow?
  372     {
  373         int field;
  374 
  375         gpubusid = bp = malloc((numgpus+1) * sizeof(char *));
  376         gputypes = tp = malloc((numgpus+1) * sizeof(char *));
  377         gputasks = cp = malloc((numgpus)   * sizeof(char  ));
  378 
  379         ptrverify(gpubusid, "Malloc failed for gpu busids\n");
  380         ptrverify(gputypes, "Malloc failed for gpu types\n");
  381         ptrverify(gputasks, "Malloc failed for gpu tasksup\n");
  382 
  383         for (field=0, start=p; ; p++)
  384         {
  385             if (*p == ' ' || *p == '\0' || *p == GPUDELIM)
  386             {
  387                 switch(field)
  388                 {
  389                    case 0:
  390                     if (p-start <= MAXGPUBUS)
  391                         *bp++ = start;
  392                     else
  393                         *bp++ = p - MAXGPUBUS;
  394                     break;
  395                    case 1:
  396                     if (p-start <= MAXGPUTYPE)
  397                         *tp++ = start;
  398                     else
  399                         *tp++ = p - MAXGPUTYPE;
  400                     break;
  401                    case 2:
  402                     *cp++ = *start;
  403                     break;
  404                 }
  405 
  406                 field++;
  407                 start = p+1;
  408 
  409                 if (*p == '\0')
  410                     break;
  411 
  412                 if (*p == GPUDELIM)
  413                     field = 0;
  414 
  415                 *p = '\0';
  416             }
  417         }
  418 
  419         *bp = NULL;
  420         *tp = NULL;
  421     }
  422 }
  423 
  424 
  425 /*
  426 ** Parse entire response string from server.
  427 **
  428 ** Every series with counters on GPU level is introduced
  429 ** with a '@' delimiter.
  430 ** Every series with counters on process level is introduced
  431 ** with a '#' delimiter (last part of the GPU level data).
  432 */
  433 static void
  434 gpustat_parse(int version, char *buf, int maxgpu, 
  435         struct pergpu *gg, struct gpupidstat *gp)
  436 {
  437     char    *p, *start, delimlast;
  438     int gpunum = 0;
  439 
  440     /*
  441     ** parse stats string
  442     */
  443     for (p=start=buf, delimlast=DUMMY; gpunum <= maxgpu; p++)
  444     {
  445         char delimnow;
  446 
  447         if (*p != '\0' && *p != GPUDELIM && *p != PIDDELIM)
  448             continue;
  449 
  450         /*
  451         ** next delimiter or end-of-string found
  452         */
  453         delimnow = *p;
  454         *p       = 0;
  455 
  456         switch (delimlast)
  457         {
  458            case DUMMY:
  459             break;
  460 
  461            case GPUDELIM:
  462             gpuparse(version, start, gg);
  463 
  464             strcpy(gg->type,  gputypes[gpunum]);
  465             strcpy(gg->busid, gpubusid[gpunum]);
  466 
  467             gpunum++;
  468             gg++;
  469             break;
  470 
  471            case PIDDELIM:
  472             if (gp)
  473             {
  474                 pidparse(version, start, gp);
  475 
  476                 gp->gpu.nrgpus++;
  477                 gp->gpu.gpulist = 1<<(gpunum-1);
  478                 gp++;
  479 
  480                 (gg-1)->nrprocs++;
  481             }
  482         }
  483 
  484         if (delimnow == 0 || *(p+1) == 0)
  485             break;
  486 
  487         start     = p+1;
  488         delimlast = delimnow;
  489     }
  490 }
  491 
  492 
  493 /*
  494 ** Parse GPU statistics string
  495 */
  496 static void
  497 gpuparse(int version, char *p, struct pergpu *gg)
  498 {
  499     switch (version)
  500     {
  501        case 1:
  502         (void) sscanf(p, "%d %d %lld %lld %lld %lld %lld %lld", 
  503             &(gg->gpupercnow), &(gg->mempercnow),
  504             &(gg->memtotnow),  &(gg->memusenow),
  505             &(gg->samples),    &(gg->gpuperccum),
  506             &(gg->memperccum), &(gg->memusecum));
  507 
  508         gg->nrprocs = 0;
  509 
  510         break;
  511     }
  512 }
  513 
  514 
  515 /*
  516 ** Parse PID statistics string
  517 */
  518 static void
  519 pidparse(int version, char *p, struct gpupidstat *gp)
  520 {
  521     switch (version)
  522     {
  523        case 1:
  524         (void) sscanf(p, "%c %ld %d %d %lld %lld %lld %lld",
  525             &(gp->gpu.state),   &(gp->pid),    
  526             &(gp->gpu.gpubusy), &(gp->gpu.membusy),
  527             &(gp->gpu.timems),
  528             &(gp->gpu.memnow), &(gp->gpu.memcum),
  529                 &(gp->gpu.sample));
  530         break;
  531     }
  532 }
  533 
  534 
  535 /*
  536 ** Merge the GPU per-process counters with the other
  537 ** per-process counters
  538 */
  539 static int compgpupid(const void *, const void *);
  540 
  541 void
  542 gpumergeproc(struct tstat      *curtpres, int ntaskpres,
  543              struct tstat      *curpexit, int nprocexit,
  544              struct gpupidstat *gpuproc,  int nrgpuproc)
  545 {
  546     struct gpupidstat   **gpp;
  547     int             t, g, gpuleft = nrgpuproc;
  548 
  549     /*
  550     ** make pointer list for elements in gpuproc
  551     */
  552     gpp = malloc(nrgpuproc * sizeof(struct gpupidstat *));
  553 
  554     if (!gpp)
  555         ptrverify(gpp, "Malloc failed for process list\n");
  556 
  557     for (g=0; g < nrgpuproc; g++)
  558         gpp[g] = gpuproc + g;
  559 
  560     /*
  561     ** sort the list with pointers in order of pid
  562     */
  563     if (nrgpuproc > 1)
  564             qsort(gpp, nrgpuproc, sizeof(struct gpupidstat *), compgpupid);
  565 
  566     /*
  567     ** accumulate entries that contain stats from same PID
  568     ** on different GPUs
  569     */
  570     for (g=1; g < nrgpuproc; g++)
  571     {
  572         if (gpp[g-1]->pid == gpp[g]->pid)
  573         {
  574             struct gpupidstat *p = gpp[g-1], *q = gpp[g];
  575 
  576             p->gpu.nrgpus  += q->gpu.nrgpus;
  577             p->gpu.gpulist |= q->gpu.gpulist;
  578 
  579             if (p->gpu.gpubusy != -1)
  580                 p->gpu.gpubusy += q->gpu.gpubusy;
  581 
  582             if (p->gpu.membusy != -1)
  583                 p->gpu.membusy += q->gpu.membusy;
  584 
  585             if (p->gpu.timems != -1)
  586                 p->gpu.timems += q->gpu.timems;
  587 
  588             p->gpu.memnow += q->gpu.memnow;
  589             p->gpu.memcum += q->gpu.memcum;
  590             p->gpu.sample += q->gpu.sample;
  591 
  592             if (nrgpuproc-g-1 > 0)
  593                 memmove(&(gpp[g]), &(gpp[g+1]),
  594                     (nrgpuproc-g-1) * sizeof p);
  595 
  596             nrgpuproc--;
  597             g--;
  598         }
  599     }
  600 
  601     /*
  602     ** merge gpu stats with sorted task list of active processes
  603     */
  604     for (t=g=0; t < ntaskpres && g < nrgpuproc; t++)
  605     {
  606         if (curtpres[t].gen.isproc)
  607         {
  608             if (curtpres[t].gen.pid == gpp[g]->pid)
  609             {
  610                 curtpres[t].gpu = gpp[g]->gpu;
  611                 gpp[g++] = NULL;
  612 
  613                 if (--gpuleft == 0 || g >= nrgpuproc)
  614                     break;
  615             }
  616 
  617             // anyhow resync
  618             while ( curtpres[t].gen.pid > gpp[g]->pid)
  619             {
  620                 if (++g >= nrgpuproc)
  621                     break;
  622             }
  623         }
  624     }
  625 
  626     if (gpuleft == 0)
  627     {
  628         free(gpp);
  629         return;
  630     }
  631 
  632     /*
  633     ** compact list with pointers to remaining pids
  634     */
  635     for (g=t=0; g < nrgpuproc; g++)
  636     {
  637         if (gpp[g] == NULL)
  638         {
  639             for (; t < nrgpuproc; t++)
  640             {
  641                 if (gpp[t])
  642                 {
  643                     gpp[g] = gpp[t];
  644                     gpp[t] = NULL;
  645                     break;
  646                 }
  647             }
  648         }
  649     }
  650 
  651     /*
  652     ** merge remaining gpu stats with task list of exited processes
  653     */
  654     for (t=0; t < nprocexit && gpuleft; t++)
  655     {
  656         if (curpexit[t].gen.isproc)
  657         {
  658             for (g=0; g < gpuleft; g++)
  659             {
  660                 if (gpp[g] == NULL)
  661                     continue;
  662 
  663                 if (curpexit[t].gen.pid == gpp[g]->pid)
  664                 {
  665                     curpexit[t].gpu = gpp[g]->gpu;
  666                     gpp[g] = NULL;
  667                     gpuleft--;
  668                 }
  669             }
  670         }
  671     }
  672 
  673     free(gpp);
  674 }
  675 
  676 static int
  677 compgpupid(const void *a, const void *b)
  678 {
  679     return (*((struct gpupidstat **)a))->pid - (*((struct gpupidstat **)b))->pid;
  680 }