Применить прокатку как часть расчета столбца?

#python #pandas

Вопрос:

Я ищу, чтобы создать новый столбец, который находит минимальное значение смещения (т. Е. Количество строк назад) минимального значения в определенном окне, проблема только в том, что размер окна меняется от строки к строке.

Способ найти минимальное значение смещения из текущей строки, используя статическое число, состоит в том, чтобы изменить порядок столбца, а затем применить функцию argmin со сдвигом, например так:

 df['initx1'] = df['Close'][::-1].rolling(20).apply(np.argmin, raw=True).shift(-(20   1))  

…однако то, что я хотел бы сделать, это использовать динамическое скользящее число без необходимости прибегать к использованию цикла for, я хочу, чтобы это было сделано в виде вычисления столбца, как описано выше. Psuedo-код того, что я пытаюсь сделать, выглядит так:

 df['initx1'] = df['Close'][::-1].rolling(df['dynamicWindowSize']).apply(np.argmin, raw=True).shift(-(df['dynamicWindowSize']   1))  

…но, конечно, это выдает ошибку, в которой говорится, что rolling должен быть целым числом, хотя все ряды в df[‘dynamicWindowSize’] ЯВЛЯЮТСЯ целыми числами, но я знаю, что это потому, что я передаю серию. Есть какие-нибудь идеи?

Воспроизводимый набор данных и код приведены ниже для работы с:

 import numpy as np import pandas as pd from datetime import datetime  # Create a reproducible, static dataframe. # 1 minute SPY data. Skip to the bottom... df = pd.DataFrame([  {  "time": "2021-10-26 9:30",  "open": "457.2",  "high": "457.29",  "low": "456.78",  "close": "456.9383",  "volume": "594142"  },  {  "time": "2021-10-26 9:31",  "open": "456.94",  "high": "457.07",  "low": "456.8",  "close": "456.995",  "volume": "194061"  },  {  "time": "2021-10-26 9:32",  "open": "456.99",  "high": "457.22",  "low": "456.84",  "close": "457.21",  "volume": "186114"  },  {  "time": "2021-10-26 9:33",  "open": "457.22",  "high": "457.45",  "low": "457.2011",  "close": "457.308",  "volume": "294158"  },  {  "time": "2021-10-26 9:34",  "open": "457.31",  "high": "457.4",  "low": "457.25",  "close": "457.32",  "volume": "172574"  },  {  "time": "2021-10-26 9:35",  "open": "457.31",  "high": "457.48",  "low": "457.18",  "close": "457.44",  "volume": "396668"  },  {  "time": "2021-10-26 9:36",  "open": "457.48",  "high": "457.6511",  "low": "457.44",  "close": "457.57",  "volume": "186777"  },  {  "time": "2021-10-26 9:37",  "open": "457.5699",  "high": "457.73",  "low": "457.5699",  "close": "457.69",  "volume": "187596"  },  {  "time": "2021-10-26 9:38",  "open": "457.7",  "high": "457.73",  "low": "457.54",  "close": "457.63",  "volume": "185570"  },  {  "time": "2021-10-26 9:39",  "open": "457.63",  "high": "457.64",  "low": "457.31",  "close": "457.59",  "volume": "164707"  },  {  "time": "2021-10-26 9:40",  "open": "457.59",  "high": "457.72",  "low": "457.46",  "close": "457.7199",  "volume": "167438"  },  {  "time": "2021-10-26 9:41",  "open": "457.72",  "high": "457.8",  "low": "457.68",  "close": "457.72",  "volume": "199951"  },  {  "time": "2021-10-26 9:42",  "open": "457.73",  "high": "457.74",  "low": "457.6",  "close": "457.62",  "volume": "152134"  },  {  "time": "2021-10-26 9:43",  "open": "457.6",  "high": "457.65",  "low": "457.45",  "close": "457.5077",  "volume": "142530"  },  {  "time": "2021-10-26 9:44",  "open": "457.51",  "high": "457.64",  "low": "457.4001",  "close": "457.61",  "volume": "122575"  },  {  "time": "2021-10-26 9:45",  "open": "457.61",  "high": "457.76",  "low": "457.58",  "close": "457.75",  "volume": "119886"  },  {  "time": "2021-10-26 9:46",  "open": "457.74",  "high": "457.75",  "low": "457.37",  "close": "457.38",  "volume": "183157"  },  {  "time": "2021-10-26 9:47",  "open": "457.42",  "high": "457.49",  "low": "457.37",  "close": "457.44",  "volume": "128542"  },  {  "time": "2021-10-26 9:48",  "open": "457.43",  "high": "457.49",  "low": "457.33",  "close": "457.44",  "volume": "154181"  },  {  "time": "2021-10-26 9:49",  "open": "457.43",  "high": "457.5898",  "low": "457.42",  "close": "457.47",  "volume": "163063"  },  {  "time": "2021-10-26 9:50",  "open": "457.45",  "high": "457.59",  "low": "457.44",  "close": "457.555",  "volume": "96229"  },  {  "time": "2021-10-26 9:51",  "open": "457.56",  "high": "457.61",  "low": "457.31",  "close": "457.4217",  "volume": "110380"  },  {  "time": "2021-10-26 9:52",  "open": "457.42",  "high": "457.56",  "low": "457.42",  "close": "457.47",  "volume": "107518"  },  {  "time": "2021-10-26 9:53",  "open": "457.475",  "high": "457.51",  "low": "457.4",  "close": "457.48",  "volume": "78062"  },  {  "time": "2021-10-26 9:54",  "open": "457.49",  "high": "457.57",  "low": "457.42",  "close": "457.46",  "volume": "133883"  },  {  "time": "2021-10-26 9:55",  "open": "457.47",  "high": "457.56",  "low": "457.45",  "close": "457.51",  "volume": "98998"  },  {  "time": "2021-10-26 9:56",  "open": "457.51",  "high": "457.54",  "low": "457.43",  "close": "457.43",  "volume": "110237"  },  {  "time": "2021-10-26 9:57",  "open": "457.43",  "high": "457.65",  "low": "457.375",  "close": "457.65",  "volume": "98794"  },  {  "time": "2021-10-26 9:58",  "open": "457.66",  "high": "457.69",  "low": "457.35",  "close": "457.45",  "volume": "262154"  },  {  "time": "2021-10-26 9:59",  "open": "457.45",  "high": "457.47",  "low": "457.33",  "close": "457.4",  "volume": "74685"  },  {  "time": "2021-10-26 10:00",  "open": "457.41",  "high": "457.48",  "low": "457.18",  "close": "457.38",  "volume": "166617"  },  {  "time": "2021-10-26 10:01",  "open": "457.39",  "high": "457.7",  "low": "457.39",  "close": "457.5",  "volume": "265649"  },  {  "time": "2021-10-26 10:02",  "open": "457.51",  "high": "457.57",  "low": "457.39",  "close": "457.53",  "volume": "131947"  },  {  "time": "2021-10-26 10:03",  "open": "457.53",  "high": "457.54",  "low": "457.4",  "close": "457.51",  "volume": "80111"  },  {  "time": "2021-10-26 10:04",  "open": "457.51",  "high": "457.62",  "low": "457.5",  "close": "457.6101",  "volume": "117174"  },  {  "time": "2021-10-26 10:05",  "open": "457.621",  "high": "457.64",  "low": "457.51",  "close": "457.58",  "volume": "168758"  },  {  "time": "2021-10-26 10:06",  "open": "457.58",  "high": "457.64",  "low": "457.46",  "close": "457.61",  "volume": "84076"  },  {  "time": "2021-10-26 10:07",  "open": "457.62",  "high": "457.7401",  "low": "457.62",  "close": "457.66",  "volume": "125156"  },  {  "time": "2021-10-26 10:08",  "open": "457.665",  "high": "457.69",  "low": "457.5",  "close": "457.67",  "volume": "116919"  },  {  "time": "2021-10-26 10:09",  "open": "457.69",  "high": "457.72",  "low": "457.5",  "close": "457.57",  "volume": "102551"  },  {  "time": "2021-10-26 10:10",  "open": "457.56",  "high": "457.75",  "low": "457.56",  "close": "457.7",  "volume": "109165"  },  {  "time": "2021-10-26 10:11",  "open": "457.7",  "high": "457.725",  "low": "457.63",  "close": "457.66",  "volume": "146209"  },  {  "time": "2021-10-26 10:12",  "open": "457.665",  "high": "457.88",  "low": "457.64",  "close": "457.86",  "volume": "210620"  },  {  "time": "2021-10-26 10:13",  "open": "457.855",  "high": "457.96",  "low": "457.83",  "close": "457.95",  "volume": "159975"  },  {  "time": "2021-10-26 10:14",  "open": "457.95",  "high": "458.02",  "low": "457.93",  "close": "457.95",  "volume": "152042"  },  {  "time": "2021-10-26 10:15",  "open": "457.96",  "high": "458.15",  "low": "457.96",  "close": "458.08",  "volume": "146047"  },  {  "time": "2021-10-26 10:16",  "open": "458.085",  "high": "458.17",  "low": "457.99",  "close": "458.15",  "volume": "100732"  },  {  "time": "2021-10-26 10:17",  "open": "458.17",  "high": "458.33",  "low": "458.155",  "close": "458.245",  "volume": "235072"  },  {  "time": "2021-10-26 10:18",  "open": "458.25",  "high": "458.29",  "low": "458.14",  "close": "458.16",  "volume": "422002"  },  {  "time": "2021-10-26 10:19",  "open": "458.17",  "high": "458.2801",  "low": "458.1699",  "close": "458.28",  "volume": "114611"  },  {  "time": "2021-10-26 10:20",  "open": "458.29",  "high": "458.39",  "low": "458.24",  "close": "458.37",  "volume": "241797"  },  {  "time": "2021-10-26 10:21",  "open": "458.37",  "high": "458.42",  "low": "458.31",  "close": "458.345",  "volume": "124824"  },  {  "time": "2021-10-26 10:22",  "open": "458.33",  "high": "458.49",  "low": "458.33",  "close": "458.47",  "volume": "132125"  } ])  # Convert df to numeric and time to datetime re: the .csv to .json # converter tool I used online... df[['open','high','low','close','volume']] = df[['open','high','low','close','volume']].apply(pd.to_numeric) df['time'] = pd.to_datetime(df['time'])  # Define an initial x1 value. This is the minimum offset index number for the min # of the last 20 rows. So, looking at the previous 20 rows, INCLUDING the current # one, if the minimum close value happened 5 bars ago, this number would be 5. # If the current bar holds the lowest close over the last 20 bars, this number # would be 0. This is not important to the question as this part works as is. df['initx1'] = df['close'][::-1].rolling(20).apply(np.argmax, raw=True).shift(-(20   1))  # Part of the strategy I'm wanting to emulate requires to get the second last initx1 # value and to add 2 to that number, so that's what I'm doing here df['x1'] = df['initx1'].shift(2)   2  # 'dynamicWindowSize' is called 'x2' and is just 1   x1 df['x2'] = df['x1']   1  # To mitigate the error "trying to set rolling on an empty sequence" later, # just drop nan rows from x1 and x2 now. Will only be the top 20 rows anyway. df = df.dropna(subset=['x1', 'x2'])  # Convert x2 to integer df['x2'] = df['x2'].astype(int)  # Inspect print(df)  # This is the pseudo code of what I want. It doesn't work now obviously, # but the rolling window size just needs to be the value in x2 for each # row df['lfOffset'] = df['close'][::-1].rolling(df['x2']).apply(np.argmin, raw=True).shift(-(df['x2']   1))  # # I found an answer from https://stackoverflow.com/questions/49955008/python-pandas-rolling-mean-with-window-value-in-another-column # # that I tried modifying for my purposes here, but also doesn't work. # def get_lfOffset_min(x): # win = df.loc[:,'x2'].iloc[x.shape[0]-1].astype('int') # win = max(win, 0) # return pd.Series(x).rolling(window = win).apply(np.argmin, raw=True).shift(-(win   1)) #.mean().iloc[-1] # df.loc[:,'lfOffset'] = df.loc[:,'close'].expanding().apply(get_lfOffset_min)  

Комментарии:

1. почему бы вам не попробовать .rolling(int(df[‘dynamicWindowSize’]))

2. @tdy Да, это похожая концепция, но эта функция agg с лямбда-вещами немного выше моего понимания, поэтому я не уверен, как заставить это работать здесь?

3. @deirdreamuel TypeError: cannot convert the series to lt;class 'int'gt;

4. таким образом, df[‘dynamicWindowSize’] — это не целое число, а ряд. В коде, который у вас есть, у вас есть только «открыть», «высокий», «низкий»,»закрыть»,»громкость» и время. Как вы устанавливаете df[‘dynamicWindowSize’]?

5. Да, как сказано в вопросе, я не пытаюсь использовать статическое число, я хочу основывать расчет столбца на значениях в столбце «dynamicWindowSize», который для этой цели можно считать набором случайных целых чисел. Если вам нужно, используйте это для создания столбца случайных чисел: data = np.random.randint(0, 10, size=len(df)) df['dynamicWindowSize'] = data

Ответ №1:

Обычно нельзя смотреть на два столбца внутри функции окна (функция, поставляемая в .apply() ). Начиная с Pandas 1.3 существует исключение: если использовать Numba и указать method="table" , в функцию передается весь фрейм данных. Он передается в виде массива, что делает код несколько уродливым, но он выполняет свою работу.

 import numba import math  @numba.njit def compute_offset(arr):  window_size = arr[-1, 1]  min_ = math.inf  min_idx = None  window_start = max(arr.shape[0] - window_size, 0)    for i in range(arr.shape[0] - 1, window_start - 1, -1):  if arr[i, 0] lt; min_:  min_ = arr[i, 0]  min_idx = i    result = np.empty(3)  result[:2] = arr[-1, :2]  result[2] = arr.shape[0] - 1 - min_idx # the offset    return result  offset = (  df[["close", "x2"]]  .assign(offset=-1)  .expanding(method="table")  .apply(compute_offset, raw=True, engine='numba')  .offset )  df_processed = df.join(offset)  

Вот первые 20 строк результата, вычисленного на основе предоставленных входных данных:

введите описание изображения здесь