"Fossies" - the Fresh Open Source Software Archive 
Member "atop-2.8.1/atopgpud" (7 Jan 2023, 19360 Bytes) of package /linux/misc/atop-2.8.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style:
standard) with prefixed line numbers.
Alternatively you can here
view or
download the uninterpreted source code file.
See also the last
Fossies "Diffs" side-by-side code changes report for "atopgpud":
2.7.0_vs_2.7.1.
#!/usr/bin/python3 -Es

# ==============================================================
# Daemon that gathers statistical information from all
# Nvidia GPUs in the current system. Every second, it gathers
# the statistics of every GPU and maintains cumulative counters,
# globally and per process.
#
# Client processes can connect to this daemon on TCP port 59123.
# Clients can send requests of two bytes, consisting of one byte
# request code followed by one byte integer version number.
# The request code can be 'T' to obtain the GPU types or 'S' to
# obtain all statistical counters.
# The response of the daemon starts with a 4-byte integer. The
# first byte is the version of the response format and the
# subsequent three bytes indicate the length (big endian) of the
# response string that follows. See the formatters for the layout
# of the response string, later on in this source code.
#
# Dependencies: pip/pip3 install nvidia-ml-py
#
# This program can be executed by python2 or python3 (just change
# the first line of this source file).
# --------------------------------------------------------------
# Author: Gerlof Langeveld
# Date: July 2018 (initial)
#
# Copyright (C) 2018 Gerlof Langeveld
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# ==============================================================
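#
# As a worked example of the response header: for response format
# version 1 and a response string of 300 bytes, the daemon sends
# the 32-bit value (1 << 24) + 300 = 0x0100012C, i.e. the bytes
# 0x01 0x00 0x01 0x2C in big endian order, followed by the
# 300-byte response string itself.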

import os
import sys
import time
import socket
import struct
import logging
import logging.handlers as loghand
import threading

GPUDPORT = 59123            # TCP port number of this server

COMPUTE  = 1                # task support bit value
ACCOUNT  = 2                # task support bit value

# =================================
# GPU related bookkeeping
# =================================
gpulist = []                # list with per-GPU bookkeeping
cliterm = {}                # dict with one entry per client (client
                            # socket as key), that contains a dict with
                            # the terminated per-process bookkeepings
                            # that still have to be received by this client
                            # (pid of terminated process as key)

gpulock = threading.Lock()  # mutex for access to gpulist/cliterm


# =================================
# per-GPU class
# =================================
class Stats(object): pass   # generic statistics container

class GpuProp(object):
    ###############################
    # initialization method to setup
    # properties
    ###############################
    def __init__(self, num):
        gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(num)
        pciInfo = pynvml.nvmlDeviceGetPciInfo(gpuhandle)

        self.gpuhandle = gpuhandle
        self.stats = Stats()

        self.stats.busid = pciInfo.busId.decode('ascii', errors='replace')
        self.stats.devname = pynvml.nvmlDeviceGetName(gpuhandle).decode(
                               'ascii', errors='replace').replace(' ', '_')

        self.stats.tasksupport = 0    # process stats support

        try:
            pynvml.nvmlDeviceGetComputeRunningProcesses(gpuhandle)
            self.stats.tasksupport |= COMPUTE    # compute support
        except Exception:
            pass                                 # no compute support

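        # enabling accounting mode is a privileged operation (NVML
        # documents nvmlDeviceSetAccountingMode as requiring
        # administrator permissions), so this is expected to fail
        # when the daemon is started as an ordinary user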
        try:
            pynvml.nvmlDeviceSetAccountingMode(gpuhandle, True)
            pynvml.nvmlDeviceSetPersistenceMode(gpuhandle, True) # NVIDIA advice
            self.stats.tasksupport |= ACCOUNT    # account support
        except Exception:
            pass                                 # no accounting support

        self.stats.gpupercnow = 0    # perc of time that GPU was busy
        self.stats.mempercnow = 0    # perc of time that memory was rd/wr

        self.stats.memtotalnow = 0   # in KiB
        self.stats.memusednow = 0    # in KiB

        self.stats.gpusamples = 0
        self.stats.gpuperccum = 0    # perc of time that GPU was busy
        self.stats.memperccum = 0    # perc of time that memory was rd/wr
        self.stats.memusedcum = 0    # in KiB

        self.stats.procstats = {}    # stats of active processes (key = pid)

    ###############################
    # method to fetch counters and values
    ###############################
    def readstats(self):
        self.stats.gpusamples += 1

        # -----------------------------
        # get rates (utilization percentages)
        # -----------------------------
        try:
            rates = pynvml.nvmlDeviceGetUtilizationRates(self.gpuhandle)

            self.stats.gpupercnow = rates.gpu
            self.stats.mempercnow = rates.memory
            self.stats.gpuperccum += rates.gpu
            self.stats.memperccum += rates.memory
        except pynvml.NVMLError:
            self.stats.gpupercnow = -1
            self.stats.mempercnow = -1
            self.stats.gpuperccum = -1
            self.stats.memperccum = -1

        # -----------------------------
        # get memory occupation GPU-wide
        # -----------------------------
        try:
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpuhandle)

            self.stats.memtotalnow = meminfo.total // 1024
            self.stats.memusednow = meminfo.used // 1024
            self.stats.memusedcum += meminfo.used // 1024    # in KiB
        except pynvml.NVMLError:
            pass

        # -----------------------------
        # get per-process statistics
        # -----------------------------
        try:
            procinfo = pynvml.nvmlDeviceGetComputeRunningProcesses(
                                self.gpuhandle)

            # -------------------------
            # build list with pids from
            # the previous interval
            # -------------------------
            actprocs = list(self.stats.procstats.keys())

            # -------------------------
            # handle proc stats of this
            # interval
            # -------------------------
            for proc in procinfo:
                pid = proc.pid

                # ---------------------
                # new process?
                # create new stats
                # ---------------------
                if pid not in actprocs:
                    self.stats.procstats[pid] = Stats()

                    self.stats.procstats[pid].memnow = 0    # in KiB
                    self.stats.procstats[pid].memcum = 0    # in KiB
                    self.stats.procstats[pid].sample = 0

                    self.stats.procstats[pid].gpubusy = -1
                    self.stats.procstats[pid].membusy = -1
                    self.stats.procstats[pid].timems = -1
                else:
                    actprocs.remove(pid)

                # ---------------------
                # maintain proc stats
                # ---------------------
                if proc.usedGpuMemory:
                    self.stats.procstats[pid].memnow = proc.usedGpuMemory//1024
                    self.stats.procstats[pid].memcum += proc.usedGpuMemory//1024
                    self.stats.procstats[pid].sample += 1

                if self.stats.tasksupport & ACCOUNT:
                    try:
                        stats = pynvml.nvmlDeviceGetAccountingStats(self.gpuhandle, pid)

                        self.stats.procstats[pid].gpubusy = stats.gpuUtilization
                        self.stats.procstats[pid].membusy = stats.memoryUtilization
                        self.stats.procstats[pid].timems = stats.time
                    except Exception:
                        pass

            # -------------------------
            # determine which processes
            # have terminated since
            # previous sample
            # -------------------------
            for pid in actprocs:
                for client in cliterm:
                    cliterm[client][pid] = self.stats.procstats[pid]

                del self.stats.procstats[pid]

        except pynvml.NVMLError:
            pass


    ###############################
    # obtain current statistics
    ###############################
    def getstats(self):
        return self.stats


# =================================
# Main function
# =================================
def main():
    # -----------------------------
    # initialize GPU access,
    # specifically to detect whether
    # it succeeds
    # -----------------------------
    try:
        pynvml.nvmlInit()
    except Exception:
        logging.error("Shared lib 'libnvidia-ml' probably not installed!")
        sys.exit(1)

    # -----------------------------
    # open IPv6 stream socket
    # -----------------------------
    try:
        mysock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM, 0)
    except Exception:
        try:
            mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0)
        except Exception:
            logging.error("Socket creation failed")
            sys.exit(1)

    # -----------------------------
    # bind to local port and
    # make socket passive
    # -----------------------------
    try:
        mysock.bind(("", GPUDPORT))
        mysock.listen(32)
    except Exception:
        logging.error("Socket binding to port %d failed", GPUDPORT)
        sys.exit(1)

    # -----------------------------
    # release parent process
    # (daemonize)
    # -----------------------------
    try:
        if os.fork():
            sys.exit(0)    # parent process exits; child continues...
    except Exception:
        logging.error("Failed to fork child")

    # -----------------------------
    # initialize GPU access for the
    # child process
    # -----------------------------
    try:
        pynvml.nvmlInit()
    except Exception:
        pass

    # -----------------------------
    # determine number of GPUs in
    # this system
    # -----------------------------
    gpunum = pynvml.nvmlDeviceGetCount()
    logging.info("Number of GPUs: %d", gpunum)

    if gpunum == 0:
        logging.info("Terminated (no GPUs available)")
        sys.exit()

    # -----------------------------
    # initialize per-GPU bookkeeping
    # -----------------------------
    for i in range(gpunum):
        gpulist.append(GpuProp(i))

    # -----------------------------
    # kick off new thread to fetch
    # statistics periodically
    # (one-second interval)
    # -----------------------------
    t = threading.Thread(target=gpuscanner, args=(1,))
    t.daemon = True
    t.start()

    logging.info("Initialization succeeded")

    # -----------------------------
    # main thread:
    # await connect of client
    # -----------------------------
    while True:
        newsock, peeraddr = mysock.accept()

        # -------------------------
        # create new thread to
        # serve this client
        # -------------------------
        t = threading.Thread(target=serveclient, args=(newsock, peeraddr))
        t.daemon = True
        t.start()


# ===========================================
# Thread start function:
# Serve new client that has just
# connected.
#
# -------------------------------------------
# Protocol between client and server:
#
# - client transmits request
#   consisting of two bytes
#
#     byte 0: type of request
#             'S' get statistical counters
#             'T' get type of each GPU
#
#     byte 1: integer version number
#             response layout might change
#             so the client asks for a
#             specific response version
#
# - server transmits response
#   consisting of a four-byte integer
#   in big endian byte order
#
#     byte 0:   version number, preferably
#               as requested by the client
#
#     byte 1-3: length of the response string
#               that follows
#
#   followed by the response string that is
#   version specific (see gpuformatters)
#
# A minimal client sketch can be found after
# this function.
# ===========================================
def serveclient(sock, peer):
    # -----------------------------
    # create per client bookkeeping
    # for terminated processes
    # -----------------------------
    with gpulock:
        cliterm[sock] = {}

    # -----------------------------
    # main loop
    # -----------------------------
    while True:
        # -------------------------
        # wait for request
        # -------------------------
        try:
            rcvbuf = sock.recv(20)
        except Exception as sockex:
            logging.error("Receive error: %s", sockex)
            sock.close()
            break

        # -------------------------
        # connection closed by peer?
        # -------------------------
        if not rcvbuf:
            sock.close()
            break

        logging.debug("Received: %s", rcvbuf)

        # -------------------------
        # request has wrong length?
        # -------------------------
        if len(rcvbuf) != 2:
            logging.error('Wrong request length: %d', len(rcvbuf))
            sock.close()
            break

        # -------------------------
        # valid request:
        # get statistical counters?
        # -------------------------
        try:
            command = chr(rcvbuf[0])    # Python3
            version = rcvbuf[1]
        except Exception:
            command = rcvbuf[0]         # Python2
            version = ord(rcvbuf[1])

        if command == 'S':
            if version == 0 or version >= len(gpuformatters):
                version = len(gpuformatters)-1

            xmitbuf = gpuformatters[version](sock).encode('ascii',
                                                    errors='replace')

        # -------------------------
        # valid request:
        # get GPU types?
        # -------------------------
        elif command == 'T':
            if version == 0 or version >= len(gpudevnames):
                version = len(gpudevnames)-1

            xmitbuf = gpudevnames[version]().encode('ascii', errors='replace')

        # -------------------------
        # invalid request!
        # -------------------------
        else:
            logging.error('Wrong request from client: %s', command)
            sock.close()
            break

        # -------------------------
        # transmit GPU statistics
        # as bytes
        # -------------------------
        logging.debug("Send: %s", xmitbuf)

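        # the four-byte prelude carries the version number in its
        # most significant byte and the length of the response
        # string in the lower three bytes (big endian)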
        prelude = struct.pack(">I", (version << 24) + len(xmitbuf))

        try:
            # use sendall to avoid partial transmission
            sock.sendall(prelude)
            sock.sendall(xmitbuf)
        except Exception as sockex:
            logging.error("Send error: %s", sockex)
            sock.close()
            break

    # -----------------------------
    # delete per client bookkeeping
    # of terminated processes
    # -----------------------------
    with gpulock:
        del cliterm[sock]

    # -----------------------------
    # END OF CLIENT THREAD
    # -----------------------------

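# ===========================================
# Illustrative sketch, not used by the daemon
# itself: a minimal client for the protocol
# described above. The helper name and the
# localhost default are assumptions chosen
# for this example.
# ===========================================
def _example_client(reqtype=b'T', version=1, host='localhost'):
    clisock = socket.create_connection((host, GPUDPORT))

    # two-byte request: request code plus version number
    clisock.sendall(reqtype + struct.pack("B", version))

    # four-byte prelude: version in the most significant byte,
    # response length in the lower three bytes (big endian);
    # a robust client would loop until all four bytes arrived
    (word,) = struct.unpack(">I", clisock.recv(4))
    respversion = word >> 24
    resplen = word & 0xffffff

    # receive the response string itself
    data = b''
    while len(data) < resplen:
        chunk = clisock.recv(resplen - len(data))
        if not chunk:
            break
        data += chunk

    clisock.close()
    return respversion, data.decode('ascii', errors='replace')
#
# usage, e.g.: print(_example_client(b'S')[1])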


# =================================
# Generate sequence of device names
# =================================
def gpudevname_v1():
    # -----------------------------
    # main loop:
    # - get device name of every GPU
    # - convert into one string
    #   with format:
    #     numgpus@busid devname tasksupport@busid devname tasksupport@...
    # -----------------------------
    strbuf = str(len(gpulist))

    with gpulock:
        for i, gpu in enumerate(gpulist):
            s = gpu.getstats()
            strbuf += "@{:s} {:s} {:d}".format(
                       s.busid, s.devname, s.tasksupport)

    return strbuf

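# a hypothetical example of the string returned for a system with
# one GPU that supports both compute and accounting stats (bit
# values 1 and 2):
#   "1@00000000:02:00.0 Tesla_T4 3"
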
gpudevnames = [None, gpudevname_v1]


# =================================
# Convert statistics of all GPUs
# into parseable string
# =================================
def gpuformatter_v1(clisock):
    # -----------------------------
    # main loop:
    # - get statistics for every GPU
    # - convert stats to one string
    #   with format:
    #     numgpus@gpu0 stats#pid stats#pid stats@gpu1 stats#pid stats@....
    # -----------------------------
    strbuf = ""

    with gpulock:
        for i, gpu in enumerate(gpulist):
            s = gpu.getstats()

            # ---------------------
            # generic GPU stats
            # ---------------------
            strbuf += "@{:d} {:d} {:d} {:d} {:d} {:d} {:d} {:d}".format(
                       s.gpupercnow, s.mempercnow,
                       s.memtotalnow, s.memusednow, s.gpusamples,
                       s.gpuperccum, s.memperccum, s.memusedcum)

            # ---------------------
            # active processes for
            # this GPU
            # ---------------------
            for pid, stat in s.procstats.items():
                strbuf += "#A {:d} {:d} {:d} {:d} {:d} {:d} {:d}".format(pid,
                           stat.gpubusy, stat.membusy, stat.timems,
                           stat.memnow, stat.memcum, stat.sample)

            # ---------------------
            # terminated processes
            # for this GPU
            # ---------------------
            for pid, stat in cliterm[clisock].items():
                strbuf += "#E {:d} {:d} {:d} {:d} {:d} {:d} {:d}".format(pid,
                           stat.gpubusy, stat.membusy, stat.timems,
                           stat.memnow, stat.memcum, stat.sample)

            cliterm[clisock].clear()


    return strbuf

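# a hypothetical example of the string returned for one GPU with one
# active ('#A') and one terminated ('#E') process, shown here on
# three lines but transmitted as a single string:
#   "@23 11 15360000 4096000 60 1380 660 245760000"
#   "#A 4321 25 10 12000 4096000 245760000 60"
#   "#E 1234 -1 -1 -1 2048000 61440000 30"
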

gpuformatters = [None, gpuformatter_v1]


# =================================
# Thread start function:
# Scan all GPUs with a particular
# interval to obtain their stats
# =================================
def gpuscanner(scaninterval):
    # -----------------------------
    # main loop:
    # - get statistics for every GPU
    # - sleep for interval
    # -----------------------------
    while True:
        with gpulock:
            for gpu in gpulist:
                gpu.readstats()

        time.sleep(scaninterval)

# ==========================================================================

# -----------------------------
# initialize logging
# -----------------------------
if '-v' in sys.argv:
    loglevel = logging.DEBUG
else:
    loglevel = logging.INFO

fm = logging.Formatter('atopgpud %(levelname)s: %(message)s')

fh = loghand.SysLogHandler('/dev/log',
                facility=loghand.SysLogHandler.LOG_DAEMON)
fh.setFormatter(fm)
fh.setLevel(loglevel)

lg = logging.getLogger()    # root logger
lg.addHandler(fh)
lg.setLevel(loglevel)

# -----------------------------
# load module pynvml
# -----------------------------
try:
    import pynvml
except Exception:
    logging.error("Python module 'pynvml' not installed!")
    sys.exit(1)


try:
    # -----------------------------
    # call main function
    # -----------------------------
    main()

finally:
    # -----------------------------
    # shutdown GPU access
    # -----------------------------
    try:
        pynvml.nvmlShutdown()
    except Exception:
        pass
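
# -----------------------------
# usage sketch: the daemon can be
# started manually as ./atopgpud
# (or via an init facility); the
# optional -v flag switches on
# debug-level logging to syslog
# -----------------------------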