"Fossies" - the Fresh Open Source Software Archive

Member "atop-2.8.1/atopgpud" (7 Jan 2023, 19360 Bytes) of package /linux/misc/atop-2.8.1.tar.gz:


#!/usr/bin/python3 -Es

# ==============================================================
# Daemon that gathers statistical information from all
# Nvidia GPUs in the current system. Every second, it gathers
# the statistics of every GPU and maintains cumulative counters,
# globally and per process.
#
# Client processes can connect to this daemon on TCP port 59123.
# Clients can send requests of two bytes, consisting of one byte
# request code followed by one byte integer version number.
# The request code can be 'T' to obtain the GPU types or 'S' to
# obtain all statistical counters.
# The response of the daemon starts with a 4-byte integer. The
# first byte is the version of the response format and the
# subsequent three bytes indicate the length (big endian) of the
# response string that follows. See the formatters for the layout
# of the response string, later on in this source code.
#
# Dependencies: pip/pip3 install nvidia-ml-py
#
# This program can be executed by python2 or python3 (just change
# the first line of this source file).
# --------------------------------------------------------------
# Author: Gerlof Langeveld
# Date:   July 2018 (initial)
#
# Copyright (C) 2018 Gerlof Langeveld
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
# ==============================================================

import os
import sys
import time
import socket
import struct
import logging
import logging.handlers as loghand
import threading

GPUDPORT     = 59123        # TCP port number of the server

COMPUTE      = 1        # task support bit value
ACCOUNT      = 2        # task support bit value

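# =================================
# Example client (a minimal sketch
# added purely for illustration; it
# is not used by the daemon itself
# and the function name is not part
# of atop): it follows the request/
# response protocol described in
# the header comment above.
# =================================
def _example_client(host='localhost', request=b'S\x01'):
    # connect to the daemon and transmit a two-byte request:
    # one byte request code ('S' or 'T') plus one byte version number
    clisock = socket.create_connection((host, GPUDPORT))
    clisock.sendall(request)

    def recvall(count):
        # read exactly 'count' bytes (or less if the peer closes)
        buf = b''
        while len(buf) < count:
            chunk = clisock.recv(count - len(buf))
            if not chunk:
                break
            buf += chunk
        return buf

    # 4-byte big endian prelude: high-order byte = response version,
    # low-order three bytes = length of the response string
    (word,) = struct.unpack(">I", recvall(4))
    version = word >> 24
    length  = word & 0xffffff

    # version-specific response string (see the formatters below)
    response = recvall(length)

    clisock.close()
    return version, response.decode('ascii', errors='replace')
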
# =================================
# GPU related bookkeeping
# =================================
gpulist = []            # list with per-GPU bookkeeping
cliterm = {}            # dict with one entry per client (client
                        # socket as key), that contains a dict with
                        # the terminated per-process bookkeepings
                        # that still have to be received by this client
                        # (pid of terminated process as key)
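                        # hypothetical example of its layout:
                        #   { <client socket>: { <pid>: <Stats object>, ... }, ... }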

gpulock = threading.Lock()  # mutex for access to gpulist/cliterm


# =================================
# per-GPU class
# =================================
class Stats(object): pass       # generic statistics container

class GpuProp(object):
    ###############################
    # initialization method to setup
    # properties
    ###############################
    def __init__(self, num):
        gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(num)
        pciInfo   = pynvml.nvmlDeviceGetPciInfo(gpuhandle)

        self.gpuhandle         = gpuhandle
        self.stats             = Stats()

        self.stats.busid       = pciInfo.busId.decode('ascii', errors='replace')
        self.stats.devname     = pynvml.nvmlDeviceGetName(gpuhandle).decode(
                                   'ascii', errors='replace').replace(' ', '_')

        self.stats.tasksupport = 0      # process stats support

        try:
            procinfo = pynvml.nvmlDeviceGetComputeRunningProcesses(gpuhandle)
            self.stats.tasksupport |= COMPUTE   # compute support
        except Exception:
            pass                # no compute support

        try:
            pynvml.nvmlDeviceSetAccountingMode(gpuhandle, True)
            pynvml.nvmlDeviceSetPersistenceMode(gpuhandle, True)  # advised by NVIDIA
            self.stats.tasksupport |= ACCOUNT   # account support
        except Exception as e:
            pass

        self.stats.gpupercnow   = 0 # perc of time that GPU was busy
        self.stats.mempercnow   = 0 # perc of time that memory was rd/wr

        self.stats.memtotalnow  = 0 # in KiB
        self.stats.memusednow   = 0 # in KiB

        self.stats.gpusamples   = 0
        self.stats.gpuperccum   = 0 # perc of time that GPU was busy
        self.stats.memperccum   = 0 # perc of time that memory was rd/wr
        self.stats.memusedcum   = 0 # in KiB

        self.stats.procstats    = {}    # stats of active processes (key = pid)

    ###############################
    # method to fetch counters and values
    ###############################
    def readstats(self):
        self.stats.gpusamples += 1

        # -----------------------------
        # get rates (utilization percentages)
        # -----------------------------
        try:
            rates = pynvml.nvmlDeviceGetUtilizationRates(self.gpuhandle)

            self.stats.gpupercnow  = rates.gpu
            self.stats.mempercnow  = rates.memory
            self.stats.gpuperccum += rates.gpu
            self.stats.memperccum += rates.memory
        except pynvml.NVMLError as err:
            self.stats.gpupercnow  = -1
            self.stats.mempercnow  = -1
            self.stats.gpuperccum  = -1
            self.stats.memperccum  = -1

        # -----------------------------
        # get memory occupation GPU-wide
        # -----------------------------
        try:
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpuhandle)

            self.stats.memtotalnow = meminfo.total // 1024
            self.stats.memusednow  = meminfo.used  // 1024
            self.stats.memusedcum += meminfo.used  // 1024  # in KiB
        except pynvml.NVMLError as err:
            pass

        # -----------------------------
        # get per-process statistics
        # -----------------------------
        try:
            procinfo = pynvml.nvmlDeviceGetComputeRunningProcesses(
                                                        self.gpuhandle)

            # -------------------------
            # build list with pids from
            # the previous interval
            # -------------------------
            actprocs = list(self.stats.procstats.keys())

            # -------------------------
            # handle proc stats of this
            # interval
            # -------------------------
            for proc in procinfo:
                pid = proc.pid

                # ---------------------
                # new process?
                #     create new stats
                # ---------------------
                if pid not in actprocs:
                    self.stats.procstats[pid] = Stats()

                    self.stats.procstats[pid].memnow  = 0   # in KiB
                    self.stats.procstats[pid].memcum  = 0   # in KiB
                    self.stats.procstats[pid].sample  = 0

                    self.stats.procstats[pid].gpubusy = -1
                    self.stats.procstats[pid].membusy = -1
                    self.stats.procstats[pid].timems  = -1
                else:
                    actprocs.remove(pid)

                # ---------------------
                # maintain proc stats
                # ---------------------
                if proc.usedGpuMemory:
                    self.stats.procstats[pid].memnow  = proc.usedGpuMemory//1024
                    self.stats.procstats[pid].memcum += proc.usedGpuMemory//1024
                    self.stats.procstats[pid].sample += 1

                if self.stats.tasksupport & ACCOUNT:
                    try:
                        stats = pynvml.nvmlDeviceGetAccountingStats(self.gpuhandle, pid)

                        self.stats.procstats[pid].gpubusy = stats.gpuUtilization
                        self.stats.procstats[pid].membusy = stats.memoryUtilization
                        self.stats.procstats[pid].timems  = stats.time
                    except Exception:
                        pass

            # -------------------------
            # determine which processes
            # have terminated since
            # previous sample
            # -------------------------
            for pid in actprocs:
                for client in cliterm:
                    cliterm[client][pid] = self.stats.procstats[pid]

                del self.stats.procstats[pid]

        except pynvml.NVMLError as err:
            pass


    ###############################
    # obtain current statistics
    ###############################
    def getstats(self):
        return self.stats


# =================================
# Main function
# =================================
def main():
    # -----------------------------
    # initialize GPU access,
    # specifically to detect whether
    # it succeeds
    # -----------------------------
    try:
        pynvml.nvmlInit()
    except Exception:
        logging.error("Shared lib 'libnvidia-ml' probably not installed!")
        sys.exit()

    # -----------------------------
    #   open IPv6 stream socket
    # -----------------------------
    try:
        mysock = socket.socket(socket.AF_INET6,  socket.SOCK_STREAM, 0)
    except Exception as sockex:
        try:
            mysock = socket.socket(socket.AF_INET,  socket.SOCK_STREAM, 0)
        except Exception as sockex:
            logging.error("Socket creation fails")
            sys.exit(1)

    # -----------------------------
    #   bind to local port and
    #   make socket passive
    # -----------------------------
    try:
        mysock.bind( ("", GPUDPORT) )
        mysock.listen(32)
    except Exception as sockex:
        logging.error("Socket binding to port %d fails", GPUDPORT)
        sys.exit(1)

    # -----------------------------
    # release parent process
    # (daemonize)
    # -----------------------------
    try:
        if os.fork():
            sys.exit(0) # parent process exits; child continues...
    except Exception:
        logging.error("Failed to fork child")

    # -----------------------------
    # initialize GPU access for the
    # child process
    # -----------------------------
    try:
        pynvml.nvmlInit()
    except Exception:
        pass

    # -----------------------------
    # determine number of GPUs in
    # this system
    # -----------------------------
    gpunum = pynvml.nvmlDeviceGetCount()
    logging.info("Number of GPUs: %d", gpunum)

    if gpunum == 0:
        logging.info("Terminated (no GPUs available)")
        sys.exit()

    # -----------------------------
    # initialize per-GPU bookkeeping
    # -----------------------------
    for i in range(gpunum):
        gpulist.append( GpuProp(i) )

    # -----------------------------
    # kick off new thread to fetch
    # statistics periodically
    # -----------------------------
    t = threading.Thread(target=gpuscanner, args=(1,))
    t.daemon = True
    t.start()

    logging.info("Initialization succeeded")

    # -----------------------------
    # main thread:
    #   await connect of client
    # -----------------------------
    while True:
        newsock, peeraddr = mysock.accept()

        # -------------------------
        # create new thread to
        # serve this client
        # -------------------------
        t = threading.Thread(target=serveclient, args=(newsock, peeraddr))
        t.daemon = True
        t.start()


# ===========================================
# Thread start function:
# Serve new client that has just
# connected.
#
# -------------------------------------------
# Protocol between client and server:
#
# - client transmits request
#   consisting of two bytes
#
#     byte 0: type of request
#             'S' get statistical counters
#             'T' get type of each GPU
#
#     byte 1: integer version number
#             response layout might change
#             so the client asks for a
#             specific response version
#
# - server transmits response
#   consisting of a four-byte integer
#   in big-endian byte order
#
#     byte 0:   version number, preferably
#               as requested by the client
#
#     bytes 1-3: length of the response string
#                that follows
#
#   followed by the response string that is
#   version specific (see gpuformatters)
# ===========================================
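# -------------------------------------------
# Worked example (numbers assumed purely for
# illustration): a version-1 response string
# of 298 bytes would be preceded by the
# prelude struct.pack(">I", (1 << 24) + 298),
# i.e. the four bytes 0x01 0x00 0x01 0x2a.
# -------------------------------------------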
def serveclient(sock, peer):
    # -----------------------------
    # create per client bookkeeping
    # for terminated processes
    # -----------------------------
    with gpulock:
        cliterm[sock] = {}

    # -----------------------------
    # main loop
    # -----------------------------
    while True:
        # -------------------------
        # wait for request
        # -------------------------
        try:
            rcvbuf = sock.recv(20)
        except Exception as sockex:
            logging.error("Receive error: %s", sockex)
            sock.close()
            break

        # -------------------------
        # connection closed by peer?
        # -------------------------
        if not rcvbuf:
            sock.close()
            break

        logging.debug("Received: %s", rcvbuf)

        # -------------------------
        # request has wrong length?
        # -------------------------
        if len(rcvbuf) != 2:
            logging.error('Wrong request length: %d', len(rcvbuf))
            sock.close()
            break

        # -------------------------
        # valid request:
        #     get statistical counters?
        # -------------------------
        try:
            command = chr(rcvbuf[0])    # Python3
            version = rcvbuf[1]
        except Exception:
            command = rcvbuf[0]     # Python2
            version = ord(rcvbuf[1])

        if command == 'S':
            if version == 0 or version >= len(gpuformatters):
                version = len(gpuformatters)-1

            xmitbuf = gpuformatters[version](sock).encode('ascii',
                                                   errors='replace')

        # -------------------------
        # valid request:
        #     get GPU types?
        # -------------------------
        elif command == 'T':
            if version == 0 or version >= len(gpudevnames):
                version = len(gpudevnames)-1

            xmitbuf = gpudevnames[version]().encode('ascii', errors='replace')

        # -------------------------
        # invalid request!
        # -------------------------
        else:
            logging.error('Wrong request from client: %s', command)
            sock.close()
            break

        # -------------------------
        # transmit GPU statistics
        # as bytes
        # -------------------------
        logging.debug("Send: %s", xmitbuf)

        prelude = struct.pack(">I", (version << 24) + len(xmitbuf))

        try:
            sock.send(prelude)
            sock.send(xmitbuf)
        except Exception as sockex:
            logging.error("Send error: %s", sockex)
            sock.close()
            break

    # -----------------------------
    # delete per client bookkeeping
    # of terminated processes
    # -----------------------------
    with gpulock:
        del cliterm[sock]

    # -----------------------------
    # END OF CLIENT THREAD
    # -----------------------------


# =================================
# Generate sequence of device names
# =================================
def gpudevname_v1():
    # -----------------------------
    # main loop:
    # - get device name of every GPU
    # - convert into one string
    #   with format:
    #      numgpus@busid devname tasksupport@busid devname tasksupport@...
    # -----------------------------
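    # hypothetical example of the resulting string
    # for a system with two GPUs:
    #   "2@00000000:17:00.0 Tesla_T4 3@00000000:65:00.0 Tesla_T4 3"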
    strbuf = str( len(gpulist) )

    with gpulock:
        for i, gpu in enumerate(gpulist):
            s = gpu.getstats()
            strbuf += "@{:s} {:s} {:d}".format(
                                    s.busid, s.devname, s.tasksupport)

    return strbuf

gpudevnames = [None, gpudevname_v1]


# =================================
# Convert statistics of all GPUs
# into parseable string
# =================================
def gpuformatter_v1(clisock):
    # -----------------------------
    # main loop:
    # - get statistics for every GPU
    # - convert stats to one string
    #   with format:
    #      @gpu0 stats#pid stats#pid stats@gpu1 stats#pid stats@....
    # -----------------------------
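    # hypothetical example of the resulting string for one GPU
    # with a single active process (pid 4321):
    #   "@38 11 16280208 5242880 3600 126000 39600 18874368000#A 4321 40 12 86000 5120000 17920000000 3500"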
    strbuf = ""

    with gpulock:
        for i, gpu in enumerate(gpulist):
            s = gpu.getstats()

            # ---------------------
            # generic GPU stats
            # ---------------------
            strbuf += "@{:d} {:d} {:d} {:d} {:d} {:d} {:d} {:d}".format(
                       s.gpupercnow,  s.mempercnow,
                       s.memtotalnow, s.memusednow, s.gpusamples,
                       s.gpuperccum,  s.memperccum, s.memusedcum)

            # ---------------------
            # active processes for
            # this GPU
            # ---------------------
            for pid, stat in s.procstats.items():
                strbuf += "#A {:d} {:d} {:d} {:d} {:d} {:d} {:d}".format(pid,
                           stat.gpubusy, stat.membusy, stat.timems,
                           stat.memnow,  stat.memcum,  stat.sample)

            # ---------------------
            # terminated processes
            # for this GPU
            # ---------------------
            for pid, stat in cliterm[clisock].items():
                strbuf += "#E {:d} {:d} {:d} {:d} {:d} {:d} {:d}".format(pid,
                           stat.gpubusy, stat.membusy, stat.timems,
                           stat.memnow,  stat.memcum,  stat.sample)

            cliterm[clisock].clear()


    return strbuf


gpuformatters = [None, gpuformatter_v1]


# =================================
# Thread start function:
# Scan all GPUs with a particular
# interval to obtain their stats
# =================================
def gpuscanner(scaninterval):
    # -----------------------------
    # main loop:
    # - get statistics for every GPU
    # - sleep for interval
    # -----------------------------
    while True:
        with gpulock:
            for gpu in gpulist:
                gpu.readstats()

        time.sleep(scaninterval)

# ==========================================================================

# -----------------------------
# initialize logging
# -----------------------------
if '-v' in sys.argv:
    loglevel = logging.DEBUG
else:
    loglevel = logging.INFO

fm = logging.Formatter('atopgpud %(levelname)s: %(message)s')

fh = loghand.SysLogHandler('/dev/log',
             facility=loghand.SysLogHandler.LOG_DAEMON)
fh.setFormatter(fm)
fh.setLevel(loglevel)

lg = logging.getLogger()        # root logger
lg.addHandler(fh)
lg.setLevel(loglevel)

# -----------------------------
# load module pynvml
# -----------------------------
try:
    import pynvml
except Exception:
    logging.error("Python module 'pynvml' not installed!")
    sys.exit(1)


try:
    # -----------------------------
    # call main function
    # -----------------------------
    main()

finally:
    # -----------------------------
    # shutdown GPU access
    # -----------------------------
    try:
        pynvml.nvmlShutdown()
    except Exception:
        pass