#python #list
Вопрос:
Я провожу некоторые тесты, чтобы проверить, лучше ли некоторые варианты из моего алгоритма выборки меняют свои значения.
Как я делал их(до этого момента без сучка и задоринки) и попытался запустить еще пару тестов для получения дополнительных результатов, и я получил эту ошибку:
ValueError Traceback (most recent call last)
<ipython-input-65-41ecc2b0a0ed> in <module>
22 for g in range(0,10000):
23 # sample
---> 24 sample_df = stratified_sample(df,test,size=38, keep_index=False)
25 pathaux = "C://Users//Pedro//Desktop//EscolhasAlgoritmos//Stratified//Stratified_Tests//"
26 example = "exampleFCUL"
<ipython-input-10-7aba847839db> in stratified_sample(df, strata, size, seed, keep_index)
79 # final dataframe
80 if first:
---> 81 stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
82 first = False
83 else:
D:Anacondalibsite-packagespandascoreframe.py in query(self, expr, inplace, **kwargs)
3182 kwargs["level"] = kwargs.pop("level", 0) 1
3183 kwargs["target"] = None
-> 3184 res = self.eval(expr, **kwargs)
3185
3186 try:
D:Anacondalibsite-packagespandascoreframe.py in eval(self, expr, inplace, **kwargs)
3298 kwargs["target"] = self
3299 kwargs["resolvers"] = kwargs.get("resolvers", ()) tuple(resolvers)
-> 3300 return _eval(expr, inplace=inplace, **kwargs)
3301
3302 def select_dtypes(self, include=None, exclude=None):
D:Anacondalibsite-packagespandascorecomputationeval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
325 eng = _engines[engine]
326 eng_inst = eng(parsed_expr)
--> 327 ret = eng_inst.evaluate()
328
329 if parsed_expr.assigner is None:
D:Anacondalibsite-packagespandascorecomputationengines.py in evaluate(self)
68
69 # make sure no names in resolvers and locals/globals clash
---> 70 res = self._evaluate()
71 return _reconstruct_object(
72 self.result_type, res, self.aligned_axes, self.expr.terms.return_type
D:Anacondalibsite-packagespandascorecomputationengines.py in _evaluate(self)
117 truediv = scope["truediv"]
118 _check_ne_builtin_clash(self.expr)
--> 119 return ne.evaluate(s, local_dict=scope, truediv=truediv)
120 except KeyError as e:
121 # python 3 compat kludge
D:Anacondalibsite-packagesnumexprnecompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
820 # Create a signature
821 signature = [(name, getType(arg)) for (name, arg) in
--> 822 zip(names, arguments)]
823
824 # Look up numexpr if possible.
D:Anacondalibsite-packagesnumexprnecompiler.py in <listcomp>(.0)
819
820 # Create a signature
--> 821 signature = [(name, getType(arg)) for (name, arg) in
822 zip(names, arguments)]
823
D:Anacondalibsite-packagesnumexprnecompiler.py in getType(a)
701 if kind == 'S':
702 return bytes
--> 703 raise ValueError("unknown type %s" % a.dtype.name)
704
705
ValueError: unknown type object`
Когда я пытался понять/выполнить поиск в stackoverflow и в Google, что можно сделать в моем коде(показано ниже), чтобы устранить проблему, я ничего не смог найти об ошибке и какие решения были приняты для ее устранения.
Да, я знаю, что для python это неизвестный тип, но это список, в который я вхожу, как это всегда было, и раньше он работал со списками без каких-либо сбоев, поэтому я не могу понять показанную ошибку.
Код:
def transform(multilevelDict):
return {"t" '_' str(key) : (transform(value) if isinstance(value, dict) else value) for key, value in multilevelDict.items()}
df = pd.read_csv('testingwebsitedata6.csv', sep=';')
df['Element_Count'] = df['Element_Count'].apply((json.loads))
df['Tag_Count'] = df['Tag_Count'].apply((json.loads))
for i in range(len(df['Tag_Count'])):
df['Tag_Count'][i] = transform(df['Tag_Count'][i])
df1 = pd.DataFrame(df['Element_Count'].values.tolist())
df2 = pd.DataFrame(df['Tag_Count'].values.tolist())
df = pd.concat([df.drop('Element_Count', axis=1), df1], axis=1)
df= pd.concat([df.drop('Tag_Count', axis=1), df2], axis=1)
df= df.fillna(0)
df[df.select_dtypes(include=['float64']).columns]= df.select_dtypes(include=['float64']).astype(int)
df
test= ['link', 'document', 'heading', 'form', 'textbox', 'button', 'list', 'listitem', 'img', 'navigation', 'banner', 'main', 'article', 'contentinfo', 'checkbox', 'table', 'rowgroup', 'row', 'cell', 'listbox', 'presentation', 'figure', 'columnheader', 'separator', 'group', 'region']
print('test1')
print('n')
for g in range(0,10000):
# sample
sample_df = stratified_sample(df,test,size=38, keep_index=False)
pathaux = "C://Users//Pedro//Desktop//EscolhasAlgoritmos//Stratified//Stratified_Tests//"
example = "exampleFCUL"
randomnumber = g 1
csv = ".csv"
path = pathaux '26' '//' example str(randomnumber) csv
chosencolumns= ["Uri"]
sample_df.to_csv(path,sep=';', index = False, columns =chosencolumns, header = False)
Используемая функция стратифицированной выборки:
def stratified_sample(df, strata, size=None, seed=None, keep_index= True):
'''
It samples data from a pandas dataframe using strata. These functions use
proportionate stratification:
n1 = (N1/N) * n
where:
- n1 is the sample size of stratum 1
- N1 is the population size of stratum 1
- N is the total population size
- n is the sampling size
Parameters
----------
:df: pandas dataframe from which data will be sampled.
:strata: list containing columns that will be used in the stratified sampling.
:size: sampling size. If not informed, a sampling size will be calculated
using Cochran adjusted sampling formula:
cochran_n = (Z**2 * p * q) /e**2
where:
- Z is the z-value. In this case we use 1.96 representing 95%
- p is the estimated proportion of the population which has an
attribute. In this case we use 0.5
- q is 1-p
- e is the margin of error
This formula is adjusted as follows:
adjusted_cochran = cochran_n / 1 ((cochran_n -1)/N)
where:
- cochran_n = result of the previous formula
- N is the population size
:seed: sampling seed
:keep_index: if True, it keeps a column with the original population index indicator
Returns
-------
A sampled pandas dataframe based in a set of strata.
Examples
--------
>> df.head()
id sex age city
0 123 M 20 XYZ
1 456 M 25 XYZ
2 789 M 21 YZX
3 987 F 40 ZXY
4 654 M 45 ZXY
...
# This returns a sample stratified by sex and city containing 30% of the size of
# the original data
>> stratified = stratified_sample(df=df, strata=['sex', 'city'], size=0.3)
Requirements
------------
- pandas
- numpy
'''
population = len(df)
size = __smpl_size(population, size)
tmp = df[strata]
tmp['size'] = 1
tmp_grpd = tmp.groupby(strata).count().reset_index()
tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
# controlling variable to create the dataframe or append to it
first = True
for i in range(len(tmp_grpd)):
# query generator for each iteration
qry=''
for s in range(len(strata)):
stratum = strata[s]
value = tmp_grpd.iloc[i][stratum]
n = tmp_grpd.iloc[i]['samp_size']
if type(value) == str:
value = "'" str(value) "'"
if s != len(strata)-1:
qry = qry stratum ' == ' str(value) ' amp; '
else:
qry = qry stratum ' == ' str(value)
# final dataframe
if first:
stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
first = False
else:
tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
stratified_df = stratified_df.append(tmp_df, ignore_index=True)
return stratified_df
def stratified_sample_report(df, strata, size=None):
'''
Generates a dataframe reporting the counts in each stratum and the counts
for the final sampled dataframe.
Parameters
----------
:df: pandas dataframe from which data will be sampled.
:strata: list containing columns that will be used in the stratified sampling.
:size: sampling size. If not informed, a sampling size will be calculated
using Cochran adjusted sampling formula:
cochran_n = (Z**2 * p * q) /e**2
where:
- Z is the z-value. In this case we use 1.96 representing 95%
- p is the estimated proportion of the population which has an
attribute. In this case we use 0.5
- q is 1-p
- e is the margin of error
This formula is adjusted as follows:
adjusted_cochran = cochran_n / 1 ((cochran_n -1)/N)
where:
- cochran_n = result of the previous formula
- N is the population size
Returns
-------
A dataframe reporting the counts in each stratum and the counts
for the final sampled dataframe.
'''
population = len(df)
size = __smpl_size(population, size)
tmp = df[strata]
tmp['size'] = 1
tmp_grpd = tmp.groupby(strata).count().reset_index()
tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
return tmp_grpd
def __smpl_size(population, size):
'''
A function to compute the sample size. If not informed, a sampling
size will be calculated using Cochran adjusted sampling formula:
cochran_n = (Z**2 * p * q) /e**2
where:
- Z is the z-value. In this case we use 1.96 representing 95%
- p is the estimated proportion of the population which has an
attribute. In this case we use 0.5
- q is 1-p
- e is the margin of error
This formula is adjusted as follows:
adjusted_cochran = cochran_n / 1 ((cochran_n -1)/N)
where:
- cochran_n = result of the previous formula
- N is the population size
Parameters
----------
:population: population size
:size: sample size (default = None)
Returns
-------
Calculated sample size to be used in the functions:
- stratified_sample
- stratified_sample_report
'''
if size is None:
cochran_n = round(((1.96)**2 * 0.5 * 0.5)/ 0.02**2)
n = round(cochran_n/(1 ((cochran_n -1) /population)))
elif size >= 0 and size < 1:
n = round(population * size)
elif size < 0:
raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
elif size >= 1:
n = size
return n
Редактировать
Я проверил все типы всех переменных, прежде чем задавать этот вопрос, и они такие же, как и мои предыдущие тесты
Какие-нибудь сладости?
(Все, что я забыл упомянуть, что вы считаете важным для понимания проблемы, пожалуйста, скажите, и я отредактирую это в)
Комментарии:
1. обратная связь покажет вам, в чем проблема,
stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
. изучите переменные в этой строке, чтобы определить, являются ли они допустимыми входными данными для вызываемых вами функций2. P. S: Я отредактировал, что провел эти тесты, мой плохой @gold_cy за то, что не сказал этого