IndexError: list index out of range — ошибка возникает, хотя список существует

#python #pandas #dataframe #transactions

Вопрос:

    Merchant Coordinates
    ISSUES HERE>

    12043
    ---------------------------------------------------------------------------
    IndexError                                Traceback (most recent call last)
    <ipython-input-74-3255d6529f5d> in <module>
          4
          5 print(len(data.merchant_long_lat))
    ----> 6 data["merchant_long_lat"]=[[coords.split(" ")[1], coords.split(" ")[0]] for coords in data.merchant_long_lat]
          7 data.merchant_long_lat.head()

    <ipython-input-74-3255d6529f5d> in <listcomp>(.0)
          4
          5 print(len(data.merchant_long_lat))
    ----> 6 data["merchant_long_lat"]=[[coords.split(" ")[1], coords.split(" ")[0]] for coords in data.merchant_long_lat]
          7 data.merchant_long_lat.head()

    IndexError: list index out of range

Это ошибка, с которой я сейчас сталкиваюсь.

 #!/usr/bin/env python # coding: utf-8  # In[1]:   import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from haversine import haversine, Unit   # In[2]:   data = pd.read_excel('C:/Users/Intel/Downloads/ANZ.xlsx') data.head()   # In[3]:   pd.DataFrame({"Column": data.columns})   # In[4]:   print("Data shape", data.shape)   # In[5]:   print("Basic Checks!")   # In[6]:   print("Number of unique customer IDs =",data.customer_id.nunique())   # In[7]:   print("There are ", len(data), "rows in the dataset") print("Number of unique transaction IDs =",data.transaction_id.nunique())   # In[8]:   data.date.describe()   # In[9]:   pd.date_range(start="2018-08-01", end="2018-08-31").difference(data.date)   # In[10]:   data.info()   # In[11]:   print("Finding missing values!") missing = data.isnull().sum() missing = missing[missing gt; 0] missing_percentage = round(missing / len(data), 1) * 100 pd.DataFrame({"Number of missing values": missing, "Percentage": missing_percentage}).sort_values(by = "Percentage", ascending = False) pd.DataFrame({"Number of missing values": missing, "Percentage": missing_percentage}).sort_values(by = "Percentage", ascending = False)   # In[12]:   print("Basic Descriptive Statistics Of Numerical Variables") data.describe()   # In[13]:   print("Exploratory Data Analysis") data.status.value_counts(dropna = False)   # In[14]:   data.card_present_flag.value_counts(dropna=False)   # In[15]:   data.currency.value_counts(dropna=False)   # In[16]:   data.long_lat.head()   # In[17]:   data.txn_description.value_counts(dropna=False)   # In[18]:   plt.figure(figsize = (10,5)) sns.countplot(data.txn_description) plt.title("Number of transactions by category")   # In[19]:   data[["txn_description", "amount"]].groupby("txn_description", as_index = False).mean().sort_values(by="amount", ascending=False)   # In[20]:   plt.figure(figsize = (8, 5)) sns.barplot(x = "txn_description", y = "amount", data = data) plt.title("Average 
transaction volume by category")   # In[21]:   data.gender.value_counts(dropna = False)   # In[22]:   print("Mean Amount Spent By Each Gender") data[["gender", "amount"]].groupby("gender", as_index = False).mean().sort_values(by = "amount", ascending = False)   # In[23]:   plt.figure(figsize=(5, 5)) sns.countplot(data.gender) plt.title("Number of transactions by gender")   # In[24]:   print("Average Amount Spent By Each Gender | Visualized") plt.figure(figsize=(5, 5)) sns.barplot(x="gender", y="amount", data=data) plt.title("Average Transaction Volume By Gender")   # In[25]:   print("Merchants By Suburb In Descending Order") data.merchant_suburb.value_counts(dropna=False)   # In[26]:   print("Merchants By State In Descending Order!!!!") data.merchant_state.value_counts(dropna=False)   # In[27]:   print("Number Of Transactions By Merchant's State | Visualized") plt.figure(figsize=(12,7)) sns.countplot(data.merchant_state) plt.title("Number of transactions by state")   # In[28]:   print("Getting Merchant State and Amount ") data[["merchant_state", "amount"]].groupby("merchant_state", as_index=False).mean().sort_values(by="amount", ascending=False)   # In[29]:   print("Average Of Transaction Amount By Merchant's State | Visualized") plt.figure(figsize=(12, 7)) sns.barplot(x="merchant_state", y="amount", data=data) plt.title("Average transaction volume by state")   # In[30]:   print("Timestamp For Each Column") data.extraction.head()   # In[31]:   data[["date", "extraction"]].head()   # In[32]:   print("Country Of Transaction Or Data") data.country.value_counts(dropna=False)   # In[33]:   print("Merchant Longitude and Latitude") data.merchant_long_lat.head()   # In[34]:   print("Movement(Cr/Dr)") data.movement.value_counts(dropna=False)   # In[35]:   print("Number Of Transactions (Based On/By) Movements | Visualized Using Countplot") sns.countplot(data.movement) plt.title("Number Of Transactions By Movement")   # In[36]:   plt.figure(figsize=(5, 5)) 
sns.barplot(x="movement", y="amount", data=data) plt.title("Average transaction amount by movement")   # In[ ]:   print("Balance") plt.figure(figsize=(8, 5)) sns.distplot(data.balance) plt.title("Balance Distribution")   # In[ ]:   plt.figure(figsize=(12, 4)) sns.boxplot(data.balance)   # In[ ]:   print("Age Of Customers") plt.figure(figsize=(8, 5)) sns.distplot(data.age) plt.title("Age Distribution")   # In[ ]:   print("Amount Spent/Transacted") plt.figure(figsize=(8, 5)) sns.distplot(data.amount) plt.title("Amount Distribution") print("EDA Ends Here...")   # In[ ]:   print("Feature Engineering n")  print("Dropping Currency amp; Country as They are The Same And UnWanted Columns") print("Before: ", data.shape) data = data.drop(["currency", "country"], axis=1) print("After: ", data.shape)   # In[ ]:   print("Identifying Missing Values In The Columns") pd.DataFrame({"Number of missing values": missing, "Percentage": missing_percentage}).sort_values(by="Percentage", ascending=False)   # In[ ]:   print("Eliminating Transactions With No Purchases") nonSales = data.loc[(data.txn_description != "SALES-POS") | (data.txn_description != "POS"), :] nonSales.isnull().sum().sort_values(ascending = False)   # In[ ]:   print("Filling Missing Values With N/A") cols = ["card_present_flag", "merchant_state", "merchant_suburb", "merchant_id", "merchant_long_lat"] for col in cols:  data[col].fillna("n/a", inplace = True)   # In[ ]:   missing = data.isnull().sum() missing = missing[missing gt; 0] missing.sort_values(ascending = False)   # In[ ]:   data = data.drop(["merchant_code", "bpay_biller_code"], axis=1)   # In[ ]:   data.isnull().sum().max()   # In[ ]:   print("Creating DayOfWeek, Month and Hour Features") daily_amount = pd.DataFrame(data.groupby("date").amount.sum()) daily_amount.head()   # In[ ]:   fig, ax=plt.subplots(figsize=(12, 5)) ax.plot(daily_amount.index, daily_amount.amount) plt.title("Transaction Volume From 1/8/2018 to 31/10/2018") plt.xlabel("Date") 
plt.ylabel("Transaction Volume")   # In[ ]:   data["month"]=pd.DatetimeIndex(data.date).month data["dayofweek"]=pd.DatetimeIndex(data.date).dayofweek data[{"date", "month", "dayofweek"}].head()   # In[ ]:   data.extraction.head()   # In[ ]:   data["extraction"] = [timestamp.split("T")[1].split(".")[0] for timestamp in data.extraction] data.extraction.head()   # In[ ]:   data["hour"] = [time.split(":")[0] for time in data.extraction] data[["extraction", "hour"]].head()   # In[ ]:   print("Before: ", data.hour.dtype) data["hour"]=pd.to_numeric(data.hour) print("After: ", data.hour.dtype)   # In[ ]:   data.head()   # In[ ]:   print("Purchases And Overall Amount") purchases_amount = data.loc[(data.txn_description == "POS") | (data.txn_description == "SALES-POS"), "amount"] purchases_amount.head()   # In[ ]:   plt.figure(figsize=(8, 5)) sns.distplot(purchases_amount) plt.title("Purchase Transaction Amount")   # In[ ]:   plt.figure(figsize=(8, 5)) sns.distplot(data.amount) plt.title("Overall Transaction Amount")   # In[ ]:   data.amount.describe()   # In[ ]:   purchases_amount.describe()   # In[ ]:   print("Transaction Volume Per Customer") customer_monthly_volume=pd.DataFrame(data.groupby("customer_id").amount.sum()/3) customer_monthly_volume.head()   # In[ ]:   print("Monthly Transaction Volume | Visualized") plt.figure(figsize=(12, 5)) sns.distplot(customer_monthly_volume.amount) plt.title("Customers Monthly Transaction Volume")   # In[ ]:   print("Transaction Volume By Month | Visualized") plt.figure(figsize=(5, 5)) sns.barplot(x="month", y="amount", data=data) plt.title("Average Transaction Volume By Month")   # In[ ]:   print("Volume Throughout The Week") average_daily_volume = pd.DataFrame(data.groupby("dayofweek").amount.mean()) average_daily_volume.head()   # In[ ]:   print("Transcation Volume On An Average Day") fig, ax=plt.subplots(figsize=(8, 5)) ax.plot(average_daily_volume.index, average_daily_volume.amount) plt.title("Average Transaction Volume per Day") 
plt.ylabel("Transaction Volume") plt.xlabel("Day of week")   # In[ ]:   data.txn_description.value_counts()   # In[ ]:   data.loc[data.txn_description == "PAY/SALARY", "category"] = "Salary" data.loc[(data.txn_description == "SALES-POS") | (data.txn_description == "POS"), "category"] = "Purchase" data.category.fillna("Others", inplace=True) data[{"txn_description", "category"}].head(10)   # In[ ]:   stacked_barplot=pd.DataFrame(data.groupby(["dayofweek", "category"]).amount.count()) stacked_barplot.unstack().plot(kind="bar", stacked=True, figsize=(12, 7)) plt.title("Number Of Transactions Each Day By Category") plt.legend(["Others", "Purchases", "Salaries"]) plt.ylabel("Number of transactions") plt.xlabel("Day of week")   # In[ ]:   average_hourly_volume = pd.DataFrame(data.groupby("hour").amount.mean()) average_hourly_volume.head()   # In[ ]:   fig, ax=plt.subplots(figsize=(8, 5)) ax.plot(average_hourly_volume.index, average_hourly_volume.amount) plt.title("Average Transaction Volume per Hour") plt.ylabel("Transaction Volume") plt.xlabel("Hour")   # In[ ]:   stacked_barplot=pd.DataFrame(data.groupby(["hour", "category"]).amount.count()) stacked_barplot.unstack().plot(kind="bar", stacked=True, figsize=(12, 7)) plt.title("Number Of Transactions Each Hour By Category") plt.legend(["Others", "Purchases", "Salaries"]) plt.ylabel("Number of Transactions") plt.xlabel("Hour")   # In[ ]:   data[["long_lat", "merchant_long_lat"]].head()   # In[ ]:   print("Customer Coordinates") data["long_lat"]=[[coordinate.split(" ")[1], coordinate.split(" ")[0]] for coordinate in data.long_lat]  data.long_lat.head()   # In[ ]:   print("Merchant Coordinates")  print("ISSUES HEREgt; n")  print(len(data.merchant_long_lat)) data["merchant_long_lat"]=[[coords.split(" ")[1], coords.split(" ")[0]] for coords in data.merchant_long_lat] data.merchant_long_lat.head()   # In[ ]:   new_df=pd.DataFrame({'MerchantLocation':(tuple(num) for num in data.merchant_long_lat), 'CustomerLocation': (tuple(num) 
for num in data.long_lat)})  for index, row in new_df.iterrows():  x=row.CustomerLocation  y=row.MerchantLocation   if x[0]!="" or x[1]!="" or y[0]!="" or y[1]!="":  try:  new_df["CustomerLocation"] = new_df.CustomerLocation.replace(x, (float(x[0]), float(x[1])))  new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y, (float(y[0]), float(y[1])))   except:  print("one of these should be N/A")  new_df["CustomerLocation"] = new_df.CustomerLocation.replace(x, (x[0], x[1]))  new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y[0], y[0].replace('n/a','1'))  new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y[1], y[1].replace('n/a','1'))   else:  print("There is an empty string")  new_df["MerchantLocation"] = new_df.MerchantLocation.replace(float(y[0]), float(y[0]))  new_df["MerchantLocation"] = new_df.MerchantLocation.replace(float(y[1]), float(y[1]))  print(x,y)  #print(haversine((x[0], x[1]), (y[0], y[1]), unit='mi'), "miles...")  #print(haversine((x[0], x[1]), (y[0], float(y[1])), unit='mi'), "miles")  

Это мой код; ошибка возникает там, где я написал «ISSUES HERE». Она появляется, хотя список существует — это видно, если запустить код с данными из Excel-файла виртуальной стажировки Forage@ANZ.

P.S. Вторая проблема: строка не заменяется, хотя этот случай и перехватывается (в блоке except)!

Комментарии:

1. Готов поспорить, что coords.split(" ") хотя бы в одном случае возвращает список, в котором меньше двух элементов, поэтому coords.split(" ")[1] вызывает ошибку.

2. А… в этом есть смысл. В большинстве значений был пробел, за исключением нескольких. Как думаешь, стоит сначала отбросить пустые строки? Спасибо.

3. Попробуй str.strip().