Как группировать данные из списка именованных корней

#python #grouping #namedtuple

#python #группировка #namedtuple

Вопрос:

В python у меня есть следующие данные в списке namedtuple в памяти:

 from collections import namedtuple
perfvalues=[]
perfitem = namedtuple('perfitem', 'cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write')
item1=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk1', 650000, 500000, 1.2, 0.9)
item2=perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk2', 630000, 480000, 1.1, 1)
item3=perfitem('cluster1', 'host1', 'database1', 'dg_data2', 'disk1', 730000, 250000, 0.4, 0.7)
item4=perfitem('cluster1', 'host1', 'database2', 'dg_data1', 'disk1', 320000, 400000, 1, 0.4)
item5=perfitem('cluster1', 'host1', 'database2', 'dg_data2', 'disk1', 550000, 300000, 0.8, 0.8)
item6=perfitem('cluster1', 'host2', 'database3', 'dg_data1', 'disk1', 420000, 310000, 1.2, 0.7)
item7=perfitem('cluster1', 'host2', 'database3', 'dg_data2', 'disk1', 880000, 280000, 0.4, 0.6)
item8=perfitem('cluster1', 'host2', 'database4', 'dg_data5', 'disk1', 440000, 600000, 1, 0.5)
item9=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk1', 490000, 450000, 1, 0.4)
item10=perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk2', 410000, 450000, 1, 0.4)
perfvalues.append(item1)
perfvalues.append(item2)
perfvalues.append(item3)
perfvalues.append(item4)
perfvalues.append(item5)
perfvalues.append(item6)
perfvalues.append(item7)
perfvalues.append(item8)
perfvalues.append(item9)
print(perfvalues)
 

Я хочу сгруппировать данные по :

  • кластер
  • кластер и хост
  • кластер, хост и база данных
  • кластер и хост, база данных и дисковая группа

Мне не понадобятся данные о диске.

В каждой группе я хочу :

  • суммируйте значения read_bytes_per_sec и write_bytes_per_sec
  • вычислите среднее значение значений avg_ms_per_read и avg_ms_per_write

В результате я хочу получить список, содержащий :

 group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data5', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database3', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database4', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg 
group='per_host', cluster='cluster1', host='host1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_host', cluster='cluster1', host='host2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_cluster', cluster='cluster1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
 

результатом может быть список dict или список именованных корней, я думаю.
порядок в окончательном списке не имеет значения.
Я использую python 3.8.

Есть идеи? Спасибо

Ответ №1:

Это фреймы данных pandas, если вам нужна версия словаря python, вы получите ее, добавив .to_dict() в per_diskgroup , per_database , per_host , per_cluster . например: per_database.to_dict()

 In [2]: l = ["cluster1,host1,database1,dg_data1,disk1,650000,500000,1.2,0.9",
   ...: "cluster1,host1,database1,dg_data1,disk2,630000,480000,1.1,1",
   ...: "cluster1,host1,database1,dg_data2,disk1,730000,250000,0.4,0.7",
   ...: "cluster1,host1,database2,dg_data1,disk1,320000,400000,1,0.4",
   ...: "cluster1,host1,database2,dg_data2,disk1,550000,300000,0.8,0.8",
   ...: "cluster1,host2,database3,dg_data1,disk1,420000,310000,1.2,0.7",
   ...: "cluster1,host2,database3,dg_data2,disk1,880000,280000,0.4,0.6",
   ...: "cluster1,host2,database4,dg_data5,disk1,440000,600000,1,0.5",
   ...: "cluster1,host2,database4,dg_data2,disk1,490000,450000,1,0.4",
   ...: "cluster1,host2,database4,dg_data2,disk2,410000,450000,1,0.4"]
   ...: l = [i.split(",") for i in l]
   ...: df = pd.DataFrame(l, columns="cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
   ...: ".split())
   ...: cols = ['read_bytes_per_sec', 'write_bytes_per_sec', 'avg_ms_per_read','avg_ms_per_write']
   ...: df[cols] = df[cols].astype(float)
   ...: df
Out[2]: 
    cluster   host   database diskgroup   disk  read_bytes_per_sec  write_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
0  cluster1  host1  database1  dg_data1  disk1            650000.0             500000.0              1.2               0.9
1  cluster1  host1  database1  dg_data1  disk2            630000.0             480000.0              1.1               1.0
2  cluster1  host1  database1  dg_data2  disk1            730000.0             250000.0              0.4               0.7
3  cluster1  host1  database2  dg_data1  disk1            320000.0             400000.0              1.0               0.4
4  cluster1  host1  database2  dg_data2  disk1            550000.0             300000.0              0.8               0.8
5  cluster1  host2  database3  dg_data1  disk1            420000.0             310000.0              1.2               0.7
6  cluster1  host2  database3  dg_data2  disk1            880000.0             280000.0              0.4               0.6
7  cluster1  host2  database4  dg_data5  disk1            440000.0             600000.0              1.0               0.5
8  cluster1  host2  database4  dg_data2  disk1            490000.0             450000.0              1.0               0.4
9  cluster1  host2  database4  dg_data2  disk2            410000.0             450000.0              1.0               0.4

In [3]: aggregation = {"read_bytes_per_sec" : sum, "avg_ms_per_read" : np.mean, "avg_ms_per_write" : np.mean}
   ...: per_diskgroup = df.groupby(['disk', 'cluster', 'host', 'database', 'diskgroup']).agg(aggregation)
   ...: per_database = df.groupby (['cluster', 'host', 'database']).agg(aggregation)
   ...: per_host = df.groupby     (['host', 'cluster']).agg(aggregation)
   ...: per_cluster = df.groupby  (['cluster']).agg(aggregation)

In [4]: per_diskgroup
Out[4]: 
                                          read_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
disk  cluster  host  database  diskgroup                                                       
disk1 cluster1 host1 database1 dg_data1             650000.0              1.2               0.9
                               dg_data2             730000.0              0.4               0.7
                     database2 dg_data1             320000.0              1.0               0.4
                               dg_data2             550000.0              0.8               0.8
               host2 database3 dg_data1             420000.0              1.2               0.7
                               dg_data2             880000.0              0.4               0.6
                     database4 dg_data2             490000.0              1.0               0.4
                               dg_data5             440000.0              1.0               0.5
disk2 cluster1 host1 database1 dg_data1             630000.0              1.1               1.0
               host2 database4 dg_data2             410000.0              1.0               0.4

In [5]: per_database
Out[5]: 
                          read_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
cluster  host  database                                                        
cluster1 host1 database1           2010000.0              0.9          0.866667
               database2            870000.0              0.9          0.600000
         host2 database3           1300000.0              0.8          0.650000
               database4           1340000.0              1.0          0.433333

In [6]: per_host
Out[6]: 
                read_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
host  cluster                                                        
host1 cluster1           2880000.0             0.90              0.76
host2 cluster1           2640000.0             0.92              0.52

In [7]: per_cluster
Out[7]: 
          read_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
cluster                                                        
cluster1           5520000.0             0.91              0.64