#python #pandas #dataframe #transactions
Вопрос:
Merchant Coordinates ISSUES HEREgt; 12043 --------------------------------------------------------------------------- IndexError Traceback (most recent call last) lt;ipython-input-74-3255d6529f5dgt; in lt;modulegt; 4 5 print(len(data.merchant_long_lat)) ----gt; 6 data["merchant_long_lat"]=[[coords.split(" ")[1], coords.split(" ")[0]] for coords in data.merchant_long_lat] 7 data.merchant_long_lat.head() lt;ipython-input-74-3255d6529f5dgt; in lt;listcompgt;(.0) 4 5 print(len(data.merchant_long_lat)) ----gt; 6 data["merchant_long_lat"]=[[coords.split(" ")[1], coords.split(" ")[0]] for coords in data.merchant_long_lat] 7 data.merchant_long_lat.head() IndexError: list index out of range
Это проблема, с которой я в настоящее время сталкиваюсь
#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from haversine import haversine, Unit # In[2]: data = pd.read_excel('C:/Users/Intel/Downloads/ANZ.xlsx') data.head() # In[3]: pd.DataFrame({"Column": data.columns}) # In[4]: print("Data shape", data.shape) # In[5]: print("Basic Checks!") # In[6]: print("Number of unique customer IDs =",data.customer_id.nunique()) # In[7]: print("There are ", len(data), "rows in the dataset") print("Number of unique transaction IDs =",data.transaction_id.nunique()) # In[8]: data.date.describe() # In[9]: pd.date_range(start="2018-08-01", end="2018-08-31").difference(data.date) # In[10]: data.info() # In[11]: print("Finding missing values!") missing = data.isnull().sum() missing = missing[missing gt; 0] missing_percentage = round(missing / len(data), 1) * 100 pd.DataFrame({"Number of missing values": missing, "Percentage": missing_percentage}).sort_values(by = "Percentage", ascending = False) pd.DataFrame({"Number of missing values": missing, "Percentage": missing_percentage}).sort_values(by = "Percentage", ascending = False) # In[12]: print("Basic Descriptive Statistics Of Numerical Variables") data.describe() # In[13]: print("Exploratory Data Analysis") data.status.value_counts(dropna = False) # In[14]: data.card_present_flag.value_counts(dropna=False) # In[15]: data.currency.value_counts(dropna=False) # In[16]: data.long_lat.head() # In[17]: data.txn_description.value_counts(dropna=False) # In[18]: plt.figure(figsize = (10,5)) sns.countplot(data.txn_description) plt.title("Number of transactions by category") # In[19]: data[["txn_description", "amount"]].groupby("txn_description", as_index = False).mean().sort_values(by="amount", ascending=False) # In[20]: plt.figure(figsize = (8, 5)) sns.barplot(x = "txn_description", y = "amount", data = data) plt.title("Average transaction volume by category") # In[21]: data.gender.value_counts(dropna = False) # In[22]: print("Mean Amount Spent By Each Gender") data[["gender", "amount"]].groupby("gender", as_index = False).mean().sort_values(by = "amount", ascending = False) # In[23]: plt.figure(figsize=(5, 5)) sns.countplot(data.gender) plt.title("Number of transactions by gender") # In[24]: print("Average Amount Spent By Each Gender | Visualized") plt.figure(figsize=(5, 5)) sns.barplot(x="gender", y="amount", data=data) plt.title("Average Transaction Volume By Gender") # In[25]: print("Merchants By Suburb In Descending Order") data.merchant_suburb.value_counts(dropna=False) # In[26]: print("Merchants By State In Descending Order!!!!") data.merchant_state.value_counts(dropna=False) # In[27]: print("Number Of Transactions By Merchant's State | Visualized") plt.figure(figsize=(12,7)) sns.countplot(data.merchant_state) plt.title("Number of transactions by state") # In[28]: print("Getting Merchant State and Amount ") data[["merchant_state", "amount"]].groupby("merchant_state", as_index=False).mean().sort_values(by="amount", ascending=False) # In[29]: print("Average Of Transaction Amount By Merchant's State | Visualized") plt.figure(figsize=(12, 7)) sns.barplot(x="merchant_state", y="amount", data=data) plt.title("Average transaction volume by state") # In[30]: print("Timestamp For Each Column") data.extraction.head() # In[31]: data[["date", "extraction"]].head() # In[32]: print("Country Of Transaction Or Data") data.country.value_counts(dropna=False) # In[33]: print("Merchant Longitude and Latitude") data.merchant_long_lat.head() # In[34]: print("Movement(Cr/Dr)") data.movement.value_counts(dropna=False) # In[35]: print("Number Of Transactions (Based On/By) Movements | Visualized Using Countplot") sns.countplot(data.movement) plt.title("Number Of Transactions By Movement") # In[36]: plt.figure(figsize=(5, 5)) sns.barplot(x="movement", y="amount", data=data) plt.title("Average transaction amount by movement") # In[ ]: print("Balance") plt.figure(figsize=(8, 5)) sns.distplot(data.balance) plt.title("Balance Distribution") # In[ ]: plt.figure(figsize=(12, 4)) sns.boxplot(data.balance) # In[ ]: print("Age Of Customers") plt.figure(figsize=(8, 5)) sns.distplot(data.age) plt.title("Age Distribution") # In[ ]: print("Amount Spent/Transacted") plt.figure(figsize=(8, 5)) sns.distplot(data.amount) plt.title("Amount Distribution") print("EDA Ends Here...") # In[ ]: print("Feature Engineering n") print("Dropping Currency amp; Country as They are The Same And UnWanted Columns") print("Before: ", data.shape) data = data.drop(["currency", "country"], axis=1) print("After: ", data.shape) # In[ ]: print("Identifying Missing Values In The Columns") pd.DataFrame({"Number of missing values": missing, "Percentage": missing_percentage}).sort_values(by="Percentage", ascending=False) # In[ ]: print("Eliminating Transactions With No Purchases") nonSales = data.loc[(data.txn_description != "SALES-POS") | (data.txn_description != "POS"), :] nonSales.isnull().sum().sort_values(ascending = False) # In[ ]: print("Filling Missing Values With N/A") cols = ["card_present_flag", "merchant_state", "merchant_suburb", "merchant_id", "merchant_long_lat"] for col in cols: data[col].fillna("n/a", inplace = True) # In[ ]: missing = data.isnull().sum() missing = missing[missing gt; 0] missing.sort_values(ascending = False) # In[ ]: data = data.drop(["merchant_code", "bpay_biller_code"], axis=1) # In[ ]: data.isnull().sum().max() # In[ ]: print("Creating DayOfWeek, Month and Hour Features") daily_amount = pd.DataFrame(data.groupby("date").amount.sum()) daily_amount.head() # In[ ]: fig, ax=plt.subplots(figsize=(12, 5)) ax.plot(daily_amount.index, daily_amount.amount) plt.title("Transaction Volume From 1/8/2018 to 31/10/2018") plt.xlabel("Date") plt.ylabel("Transaction Volume") # In[ ]: data["month"]=pd.DatetimeIndex(data.date).month data["dayofweek"]=pd.DatetimeIndex(data.date).dayofweek data[{"date", "month", "dayofweek"}].head() # In[ ]: data.extraction.head() # In[ ]: data["extraction"] = [timestamp.split("T")[1].split(".")[0] for timestamp in data.extraction] data.extraction.head() # In[ ]: data["hour"] = [time.split(":")[0] for time in data.extraction] data[["extraction", "hour"]].head() # In[ ]: print("Before: ", data.hour.dtype) data["hour"]=pd.to_numeric(data.hour) print("After: ", data.hour.dtype) # In[ ]: data.head() # In[ ]: print("Purchases And Overall Amount") purchases_amount = data.loc[(data.txn_description == "POS") | (data.txn_description == "SALES-POS"), "amount"] purchases_amount.head() # In[ ]: plt.figure(figsize=(8, 5)) sns.distplot(purchases_amount) plt.title("Purchase Transaction Amount") # In[ ]: plt.figure(figsize=(8, 5)) sns.distplot(data.amount) plt.title("Overall Transaction Amount") # In[ ]: data.amount.describe() # In[ ]: purchases_amount.describe() # In[ ]: print("Transaction Volume Per Customer") customer_monthly_volume=pd.DataFrame(data.groupby("customer_id").amount.sum()/3) customer_monthly_volume.head() # In[ ]: print("Monthly Transaction Volume | Visualized") plt.figure(figsize=(12, 5)) sns.distplot(customer_monthly_volume.amount) plt.title("Customers Monthly Transaction Volume") # In[ ]: print("Transaction Volume By Month | Visualized") plt.figure(figsize=(5, 5)) sns.barplot(x="month", y="amount", data=data) plt.title("Average Transaction Volume By Month") # In[ ]: print("Volume Throughout The Week") average_daily_volume = pd.DataFrame(data.groupby("dayofweek").amount.mean()) average_daily_volume.head() # In[ ]: print("Transcation Volume On An Average Day") fig, ax=plt.subplots(figsize=(8, 5)) ax.plot(average_daily_volume.index, average_daily_volume.amount) plt.title("Average Transaction Volume per Day") plt.ylabel("Transaction Volume") plt.xlabel("Day of week") # In[ ]: data.txn_description.value_counts() # In[ ]: data.loc[data.txn_description == "PAY/SALARY", "category"] = "Salary" data.loc[(data.txn_description == "SALES-POS") | (data.txn_description == "POS"), "category"] = "Purchase" data.category.fillna("Others", inplace=True) data[{"txn_description", "category"}].head(10) # In[ ]: stacked_barplot=pd.DataFrame(data.groupby(["dayofweek", "category"]).amount.count()) stacked_barplot.unstack().plot(kind="bar", stacked=True, figsize=(12, 7)) plt.title("Number Of Transactions Each Day By Category") plt.legend(["Others", "Purchases", "Salaries"]) plt.ylabel("Number of transactions") plt.xlabel("Day of week") # In[ ]: average_hourly_volume = pd.DataFrame(data.groupby("hour").amount.mean()) average_hourly_volume.head() # In[ ]: fig, ax=plt.subplots(figsize=(8, 5)) ax.plot(average_hourly_volume.index, average_hourly_volume.amount) plt.title("Average Transaction Volume per Hour") plt.ylabel("Transaction Volume") plt.xlabel("Hour") # In[ ]: stacked_barplot=pd.DataFrame(data.groupby(["hour", "category"]).amount.count()) stacked_barplot.unstack().plot(kind="bar", stacked=True, figsize=(12, 7)) plt.title("Number Of Transactions Each Hour By Category") plt.legend(["Others", "Purchases", "Salaries"]) plt.ylabel("Number of Transactions") plt.xlabel("Hour") # In[ ]: data[["long_lat", "merchant_long_lat"]].head() # In[ ]: print("Customer Coordinates") data["long_lat"]=[[coordinate.split(" ")[1], coordinate.split(" ")[0]] for coordinate in data.long_lat] data.long_lat.head() # In[ ]: print("Merchant Coordinates") print("ISSUES HEREgt; n") print(len(data.merchant_long_lat)) data["merchant_long_lat"]=[[coords.split(" ")[1], coords.split(" ")[0]] for coords in data.merchant_long_lat] data.merchant_long_lat.head() # In[ ]: new_df=pd.DataFrame({'MerchantLocation':(tuple(num) for num in data.merchant_long_lat), 'CustomerLocation': (tuple(num) for num in data.long_lat)}) for index, row in new_df.iterrows(): x=row.CustomerLocation y=row.MerchantLocation if x[0]!="" or x[1]!="" or y[0]!="" or y[1]!="": try: new_df["CustomerLocation"] = new_df.CustomerLocation.replace(x, (float(x[0]), float(x[1]))) new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y, (float(y[0]), float(y[1]))) except: print("one of these should be N/A") new_df["CustomerLocation"] = new_df.CustomerLocation.replace(x, (x[0], x[1])) new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y[0], y[0].replace('n/a','1')) new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y[1], y[1].replace('n/a','1')) else: print("There is an empty string") new_df["MerchantLocation"] = new_df.MerchantLocation.replace(float(y[0]), float(y[0])) new_df["MerchantLocation"] = new_df.MerchantLocation.replace(float(y[1]), float(y[1])) print(x,y) #print(haversine((x[0], x[1]), (y[0], y[1]), unit='mi'), "miles...") #print(haversine((x[0], x[1]), (y[0], float(y[1])), unit='mi'), "miles")
Это мой код, и проблемы там, где я набрал «ПРОБЛЕМЫ ЗДЕСЬ». Даже несмотря на то, что список существует, как видно при запуске кода с данными фуража@ANZ в файле excel виртуальной стажировки.
P.S. Вторая проблема заключается в том, что строка не заменяется, даже если она поймана!
Комментарии:
1. Я мог бы поспорить, что
coords.split(" ")
по крайней мере в один момент возвращается только список длиной 0, поэтомуcoords.split(" ")[1]
возникает ошибка.2. А… в этом есть смысл. У большинства из них было свободное место, за исключением нескольких. Как ты думаешь, я должен сначала отбросить пустые строки? Спасибо.
3. Попробуй
str.strip()
.