#python #grouping #namedtuple
#python #группировка #namedtuple
Вопрос:
В python у меня есть следующие данные в списке namedtuple в памяти:
from collections import namedtuple

# One performance sample: identifying columns (cluster/host/database/diskgroup/disk)
# followed by four metric columns (byte rates to be summed, latencies to be averaged).
perfitem = namedtuple('perfitem', 'cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write')

item1 = perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk1', 650000, 500000, 1.2, 0.9)
item2 = perfitem('cluster1', 'host1', 'database1', 'dg_data1', 'disk2', 630000, 480000, 1.1, 1)
item3 = perfitem('cluster1', 'host1', 'database1', 'dg_data2', 'disk1', 730000, 250000, 0.4, 0.7)
item4 = perfitem('cluster1', 'host1', 'database2', 'dg_data1', 'disk1', 320000, 400000, 1, 0.4)
item5 = perfitem('cluster1', 'host1', 'database2', 'dg_data2', 'disk1', 550000, 300000, 0.8, 0.8)
item6 = perfitem('cluster1', 'host2', 'database3', 'dg_data1', 'disk1', 420000, 310000, 1.2, 0.7)
item7 = perfitem('cluster1', 'host2', 'database3', 'dg_data2', 'disk1', 880000, 280000, 0.4, 0.6)
item8 = perfitem('cluster1', 'host2', 'database4', 'dg_data5', 'disk1', 440000, 600000, 1, 0.5)
item9 = perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk1', 490000, 450000, 1, 0.4)
item10 = perfitem('cluster1', 'host2', 'database4', 'dg_data2', 'disk2', 410000, 450000, 1, 0.4)

# BUG FIX: item10 was created but never appended, so the database4/dg_data2
# group silently lost one disk. Build the list with all ten samples at once.
perfvalues = [item1, item2, item3, item4, item5,
              item6, item7, item8, item9, item10]

print(perfvalues)
Я хочу сгруппировать данные по :
- кластер
- кластер и хост
- кластер, хост и база данных
- кластер, хост, база данных и дисковая группа
Мне не понадобятся данные о диске.
В каждой группе я хочу :
- суммируйте значения read_bytes_per_sec и write_bytes_per_sec
- вычислите среднее значение значений avg_ms_per_read и avg_ms_per_write
В результате я хочу получить список, содержащий :
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database1', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host1', db='database2', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database3', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data5', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_diskgroup', cluster='cluster1', host='host2', db='database4', dg='dg_data2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host1', db='database2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database3', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_database', cluster='cluster1', host='host2', db='database4', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_host', cluster='cluster1', host='host1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_host', cluster='cluster1', host='host2', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
group='per_cluster', cluster='cluster1', read_bytes_per_sec=sum, write_bytes_per_sec=sum, avg_ms_per_read=avg, avg_ms_per_write=avg
результатом может быть список dict или список именованных кортежей (namedtuple), я думаю.
порядок в окончательном списке не имеет значения.
Я использую python 3.8.
Есть идеи? Спасибо
Ответ №1:
Это фреймы данных pandas, если вам нужна версия словаря python, вы получите ее, добавив .to_dict()
в per_diskgroup
, per_database
, per_host
, per_cluster
. например: per_database.to_dict()
In [1]: import pandas as pd
   ...: import numpy as np

In [2]: l = ["cluster1,host1,database1,dg_data1,disk1,650000,500000,1.2,0.9",
...: "cluster1,host1,database1,dg_data1,disk2,630000,480000,1.1,1",
...: "cluster1,host1,database1,dg_data2,disk1,730000,250000,0.4,0.7",
...: "cluster1,host1,database2,dg_data1,disk1,320000,400000,1,0.4",
...: "cluster1,host1,database2,dg_data2,disk1,550000,300000,0.8,0.8",
...: "cluster1,host2,database3,dg_data1,disk1,420000,310000,1.2,0.7",
...: "cluster1,host2,database3,dg_data2,disk1,880000,280000,0.4,0.6",
...: "cluster1,host2,database4,dg_data5,disk1,440000,600000,1,0.5",
...: "cluster1,host2,database4,dg_data2,disk1,490000,450000,1,0.4",
...: "cluster1,host2,database4,dg_data2,disk2,410000,450000,1,0.4"]
...: l = [i.split(",") for i in l]
...: df = pd.DataFrame(l, columns="cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
...: ".split())
...: cols = ['read_bytes_per_sec', 'write_bytes_per_sec', 'avg_ms_per_read','avg_ms_per_write']
...: df[cols] = df[cols].astype(float)
...: df
Out[2]:
cluster host database diskgroup disk read_bytes_per_sec write_bytes_per_sec avg_ms_per_read avg_ms_per_write
0 cluster1 host1 database1 dg_data1 disk1 650000.0 500000.0 1.2 0.9
1 cluster1 host1 database1 dg_data1 disk2 630000.0 480000.0 1.1 1.0
2 cluster1 host1 database1 dg_data2 disk1 730000.0 250000.0 0.4 0.7
3 cluster1 host1 database2 dg_data1 disk1 320000.0 400000.0 1.0 0.4
4 cluster1 host1 database2 dg_data2 disk1 550000.0 300000.0 0.8 0.8
5 cluster1 host2 database3 dg_data1 disk1 420000.0 310000.0 1.2 0.7
6 cluster1 host2 database3 dg_data2 disk1 880000.0 280000.0 0.4 0.6
7 cluster1 host2 database4 dg_data5 disk1 440000.0 600000.0 1.0 0.5
8 cluster1 host2 database4 dg_data2 disk1 490000.0 450000.0 1.0 0.4
9 cluster1 host2 database4 dg_data2 disk2 410000.0 450000.0 1.0 0.4
In [3]: aggregation = {"read_bytes_per_sec": "sum", "write_bytes_per_sec": "sum",
   ...:                "avg_ms_per_read": "mean", "avg_ms_per_write": "mean"}
   ...: # 'disk' намеренно не входит ни в одну группировку — данные о диске не нужны
   ...: per_diskgroup = df.groupby(['cluster', 'host', 'database', 'diskgroup']).agg(aggregation)
   ...: per_database = df.groupby(['cluster', 'host', 'database']).agg(aggregation)
   ...: per_host = df.groupby(['cluster', 'host']).agg(aggregation)
   ...: per_cluster = df.groupby(['cluster']).agg(aggregation)
In [4]: per_diskgroup
Out[4]:
                                     read_bytes_per_sec  write_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
cluster  host  database  diskgroup
cluster1 host1 database1 dg_data1             1280000.0             980000.0             1.15              0.95
                         dg_data2              730000.0             250000.0             0.40              0.70
               database2 dg_data1              320000.0             400000.0             1.00              0.40
                         dg_data2              550000.0             300000.0             0.80              0.80
         host2 database3 dg_data1              420000.0             310000.0             1.20              0.70
                         dg_data2              880000.0             280000.0             0.40              0.60
               database4 dg_data2              900000.0             900000.0             1.00              0.40
                         dg_data5              440000.0             600000.0             1.00              0.50
In [5]: per_database
Out[5]:
                          read_bytes_per_sec  write_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
cluster  host  database
cluster1 host1 database1           2010000.0            1230000.0              0.9          0.866667
               database2            870000.0             700000.0              0.9          0.600000
         host2 database3           1300000.0             590000.0              0.8          0.650000
               database4           1340000.0            1500000.0              1.0          0.433333
In [6]: per_host
Out[6]:
                read_bytes_per_sec  write_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
cluster  host
cluster1 host1           2880000.0            1930000.0             0.90              0.76
         host2           2640000.0            2090000.0             0.92              0.52
In [7]: per_cluster
Out[7]:
          read_bytes_per_sec  write_bytes_per_sec  avg_ms_per_read  avg_ms_per_write
cluster
cluster1           5520000.0            4020000.0             0.91              0.64