import os import io import sys import timeit import tokenize import torch import psutil import inspect from loguru import logger from prettytable import PrettyTable # implement by xtudbxk # github: https://github.com/xtudbxk/lineprofiler class MyLineProfiler(): def __init__(self, base='ms', cuda_sync=True, gpuids=(0,), warmup=0, warmup_lineno=-1): if base == 'ms': self.base_n = 1000 elif base == 's': self.base_n = 1 else: logguru.warning(f'Unsupported base - {base}, using "s" instead') self.base = base self.cuda_sync = cuda_sync self.gpuids = gpuids self.warmup = warmup self.warmup_counter = warmup # we should wait this line execute warup_counter times # before recording the stats self.warmup_lineno = warmup_lineno # for time profiling self._times = {} self._func_name = None self._func_filename = None self._last_time = -1 self._last_lineno = -1 self._func_hit_count = 0 self._func_firstlineno = 0 # for memory profiling self._process = psutil.Process(os.getpid()) self._memory = {} self._last_memory = 0 # for cuda memory profiling self._gpu_memory = {} self._gpu_last_memory = 0 def __trace_func__(self, frame, event, arg): # print(f'in {frame.f_code.co_filename} func {frame.f_code.co_name} line {frame.f_lineno}, event - {event}') # check if run into the decorated func if self._func_firstlineno == frame.f_code.co_firstlineno and frame.f_code.co_name == self._func_name and frame.f_code.co_filename == self._func_filename: # --- obtain info for current hit --- # cuda related if self.cuda_sync is True: torch.cuda.synchronize() current_time = timeit.default_timer() memory = self._process.memory_info().rss gpu_memory = torch.cuda.memory_allocated() # --- ends --- # --- initilize the info when first hit --- if frame.f_lineno not in self._times: # first hit time for this line self._times[frame.f_lineno] = {'hit':0, 'time': 0} self._memory[frame.f_lineno] = 0 self._gpu_memory[frame.f_lineno] = 0 # --- ends --- # --- record info before call the decorated func --- # 'call' - before call the func if event == 'call': self._last_time = current_time self._last_lineno = frame.f_lineno self._last_memory = memory self._last_gpu_memory = gpu_memory if self.warmup_lineno < 0: self.warmup_counter -= 1 if self.warmup_counter < 0: self._func_hit_count += 1 # --- ends --- # 'line' - after excuting the line # 'return' - return from the function if event == 'line' or event == 'return': if event == 'line' and self.warmup_counter < 0: self._times[frame.f_lineno]['hit'] += 1 # --- obtain the memory and time consumed by this line --- if self.warmup_counter < 0: self._times[self._last_lineno]['time'] += current_time - self._last_time self._memory[self._last_lineno] += memory - self._last_memory self._gpu_memory[self._last_lineno] += gpu_memory - self._gpu_last_memory # --- ends --- if self.cuda_sync is True: torch.cuda.synchronize() self._last_time = timeit.default_timer() self._last_memory = memory self._gpu_last_memory = gpu_memory self._last_lineno = frame.f_lineno return self.__trace_func__ def decorate(self, func): if self._func_name is not None: logger.warning(f'Only support decorate only one func. Aready decorated "{self._func_name}"') self._func_name = func.__name__ self._func_filename = func.__code__.co_filename self._func_firstlineno = func.__code__.co_firstlineno def _f(*args, **kwargs): origin_trace_func = sys.gettrace() sys.settrace(self.__trace_func__) ret = func(*args, **kwargs) sys.settrace(origin_trace_func) return ret return _f def _get_table(self): if len(self._times) <= 0: logger.warning(f"un recorded datas, please ensure the function is executed") return None # --- load the source code --- with open(self._func_filename, 'r') as f: source_lines = [line.strip('\n') for line in f.readlines()] code_str = "\n".join(source_lines) def_lineno = min(self._times.keys()) final_lineno = max(self._times.keys()) # remove the additional blank content pre_blank_count = len(source_lines[def_lineno-1]) - len(source_lines[def_lineno-1].lstrip(' ').lstrip('\t')) # --- ends --- # --- analysize the source code and collect infos for multi-line code --- new_logic_linenos = [token.start[0] for token in tokenize.generate_tokens( io.StringIO(code_str).readline) if token.type == 4] # --- ends --- # --- merge the stats multi-line code --- sorted_linenos = [lineno for lineno in self._times.keys()] sorted_linenos.sort(key=int) lineno_cache = [] for lineno in sorted_linenos: if lineno not in new_logic_linenos: lineno_cache.append(lineno) else: # we should merge its info to the prev_lineno if len(lineno_cache) <= 0: continue else: lineno_cache.append(lineno) first_lineno = lineno_cache[0] for prev_lineno in lineno_cache[1:]: self._times[first_lineno]["hit"] = min(self._times[first_lineno]["hit"], self._times[prev_lineno]["hit"]) self._times[first_lineno]["time"] += self._times[prev_lineno]["time"] del self._times[prev_lineno] self._memory[first_lineno] += self._memory[prev_lineno] del self._memory[prev_lineno] self._gpu_memory[first_lineno] += self._gpu_memory[prev_lineno] del self._gpu_memory[prev_lineno] lineno_cache = [] # --- ends --- # --- initialize the pretty table for output --- table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak']) # --- ends --- # --- compute some statisticals --- total_hit = 0 # for compute the hit percentage total_time = 0 for lineno, stats in self._times.items(): if lineno == def_lineno: continue total_hit += stats['hit'] total_time += stats['time'] total_memory = sum([m for l,m in self._memory.items()]) / 1024 / 1024 total_gpu_memory = sum([m for l,m in self._gpu_memory.items()]) / 1024 / 1024 # --- ends --- peak_cpu_memory = 0 peak_gpu_memory = 0 for lineno in range(def_lineno, final_lineno+1): if lineno not in self._times: # the comment line, empty line or merged line from multi-lines code table.add_row([lineno, '-', '-', '-', '-', '-', '-',f'{peak_cpu_memory:5.3f} MB', '-', f'{peak_gpu_memory:5.3f} MB']) else: stats = self._times[lineno] if lineno == def_lineno: table.add_row([lineno, self._func_hit_count, f'{total_time*self.base_n:.4f} {self.base}', f'{total_time/self._func_hit_count*self.base_n:.4f} {self.base}', '-', '-', f'{total_memory:5.3f} MB', 'baseline', f'{total_gpu_memory:5.3f} MB', 'baseline']) else: line_result = [lineno, stats['hit'], f'{stats["time"]*self.base_n:.4f} {self.base}', f'{stats["time"]/stats["hit"]*self.base_n:.4f} {self.base}' if stats['hit'] > 0 else 'nan', f'{stats["hit"]/total_hit*100:.3f}%' if total_hit > 0 else 'nan', f'{stats["time"]/total_time*100:.3f}%'] if total_time > 0 else 'nan' line_result += [f'{self._memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB'] peak_cpu_memory = peak_cpu_memory + self._memory[lineno]/1024/1024 line_result += [f'{peak_cpu_memory:5.3f} MB'] line_result += [f'{self._gpu_memory[lineno]/1024/1024:5.3f} MB' if stats['hit'] > 0 else '0 MB'] peak_gpu_memory = peak_gpu_memory + self._gpu_memory[lineno]/1024/1024 line_result += [f'{peak_gpu_memory:5.3f} MB'] table.add_row(line_result) table.add_column('sources', [source_lines[i-1][pre_blank_count:] if len(source_lines[i-1])>pre_blank_count else '' for i in range(def_lineno, final_lineno+1)], 'l') return table def print(self, filename=None, mode="w"): introducation = ''' 1. The first line of table reports the overall results of the whole function and the following lines reports the statistics of each line in the function. 2. The `hit perc` and `time perc` represent `hit percentage` and `time percentage`. 3. For memory, there exists four categories `mem inc`, `mem peak`, `gpu mem inc` and `gpu mem peak`. They denotes `cpu memory increasement`, `cpu memory peak`, `gpu memory increasement` and `gpu memory peak`. All the results are collected in the last run. The number in the increasement field denots the increasement of corresponding memory of each line (the first line is related to the whole function). Sometimes, the number of each line is far less of the number of the first line, which is valid since python may auto release the unused memory after the function execution. The number of each line in the peak filed is a simple sum of the numbers of above lines in the increasement field, which is used to demonstrate the possible maxinum memory usage in the function. 4. For any issue, please concact us via https://github.com/xtudbxk/lineprofiler or zhengqiang.zhang@hotmail.com ''' print(introducation) table = PrettyTable(['lineno', 'hits', 'time', 'time per hit', 'hit perc', 'time perc', 'mem inc', 'mem peak', 'gpu mem inc', 'gpu mem peak']) table = self._get_table() print(table) if filename is not None: with open(filename, mode) as f: f.write(introducation) f.write(f"args - base={self.base}, cuda_sync={self.cuda_sync}, gpuids={self.gpuids}, warmup={self.warmup}\n") f.write(str(table)) if __name__ == '__main__': import numpy as np def mytest(h='hello', xx="xx"): h = h + 'world' a = [] for _ in range(200): # a = np.zeros((1000, 1000), dtype=np.float32) a.append(np.zeros((1000, 1000), dtype=np.float32)) a.append( np.zeros((1000, 1000), dtype=np.float32)) # print(a[0,0]) print(h) profiler = MyLineProfiler(cuda_sync=False, warmup=2) mytest = profiler.decorate(mytest) for _ in range(5): mytest() profiler.print()