#python #pandas
Вопрос:
Я ищу, чтобы создать новый столбец, который находит минимальное значение смещения (т. Е. Количество строк назад) минимального значения в определенном окне, проблема только в том, что размер окна меняется от строки к строке.
Способ найти минимальное значение смещения из текущей строки, используя статическое число, состоит в том, чтобы изменить порядок столбца, а затем применить функцию argmin со сдвигом, например так:
df['initx1'] = df['Close'][::-1].rolling(20).apply(np.argmin, raw=True).shift(-(20 1))
…однако то, что я хотел бы сделать, это использовать динамическое скользящее число без необходимости прибегать к использованию цикла for, я хочу, чтобы это было сделано в виде вычисления столбца, как описано выше. Psuedo-код того, что я пытаюсь сделать, выглядит так:
df['initx1'] = df['Close'][::-1].rolling(df['dynamicWindowSize']).apply(np.argmin, raw=True).shift(-(df['dynamicWindowSize'] 1))
…но, конечно, это выдает ошибку, в которой говорится, что rolling должен быть целым числом, хотя все ряды в df[‘dynamicWindowSize’] ЯВЛЯЮТСЯ целыми числами, но я знаю, что это потому, что я передаю серию. Есть какие-нибудь идеи?
Воспроизводимый набор данных и код приведены ниже для работы с:
import numpy as np import pandas as pd from datetime import datetime # Create a reproducible, static dataframe. # 1 minute SPY data. Skip to the bottom... df = pd.DataFrame([ { "time": "2021-10-26 9:30", "open": "457.2", "high": "457.29", "low": "456.78", "close": "456.9383", "volume": "594142" }, { "time": "2021-10-26 9:31", "open": "456.94", "high": "457.07", "low": "456.8", "close": "456.995", "volume": "194061" }, { "time": "2021-10-26 9:32", "open": "456.99", "high": "457.22", "low": "456.84", "close": "457.21", "volume": "186114" }, { "time": "2021-10-26 9:33", "open": "457.22", "high": "457.45", "low": "457.2011", "close": "457.308", "volume": "294158" }, { "time": "2021-10-26 9:34", "open": "457.31", "high": "457.4", "low": "457.25", "close": "457.32", "volume": "172574" }, { "time": "2021-10-26 9:35", "open": "457.31", "high": "457.48", "low": "457.18", "close": "457.44", "volume": "396668" }, { "time": "2021-10-26 9:36", "open": "457.48", "high": "457.6511", "low": "457.44", "close": "457.57", "volume": "186777" }, { "time": "2021-10-26 9:37", "open": "457.5699", "high": "457.73", "low": "457.5699", "close": "457.69", "volume": "187596" }, { "time": "2021-10-26 9:38", "open": "457.7", "high": "457.73", "low": "457.54", "close": "457.63", "volume": "185570" }, { "time": "2021-10-26 9:39", "open": "457.63", "high": "457.64", "low": "457.31", "close": "457.59", "volume": "164707" }, { "time": "2021-10-26 9:40", "open": "457.59", "high": "457.72", "low": "457.46", "close": "457.7199", "volume": "167438" }, { "time": "2021-10-26 9:41", "open": "457.72", "high": "457.8", "low": "457.68", "close": "457.72", "volume": "199951" }, { "time": "2021-10-26 9:42", "open": "457.73", "high": "457.74", "low": "457.6", "close": "457.62", "volume": "152134" }, { "time": "2021-10-26 9:43", "open": "457.6", "high": "457.65", "low": "457.45", "close": "457.5077", "volume": "142530" }, { "time": "2021-10-26 9:44", "open": "457.51", "high": "457.64", "low": "457.4001", "close": "457.61", "volume": "122575" }, { "time": "2021-10-26 9:45", "open": "457.61", "high": "457.76", "low": "457.58", "close": "457.75", "volume": "119886" }, { "time": "2021-10-26 9:46", "open": "457.74", "high": "457.75", "low": "457.37", "close": "457.38", "volume": "183157" }, { "time": "2021-10-26 9:47", "open": "457.42", "high": "457.49", "low": "457.37", "close": "457.44", "volume": "128542" }, { "time": "2021-10-26 9:48", "open": "457.43", "high": "457.49", "low": "457.33", "close": "457.44", "volume": "154181" }, { "time": "2021-10-26 9:49", "open": "457.43", "high": "457.5898", "low": "457.42", "close": "457.47", "volume": "163063" }, { "time": "2021-10-26 9:50", "open": "457.45", "high": "457.59", "low": "457.44", "close": "457.555", "volume": "96229" }, { "time": "2021-10-26 9:51", "open": "457.56", "high": "457.61", "low": "457.31", "close": "457.4217", "volume": "110380" }, { "time": "2021-10-26 9:52", "open": "457.42", "high": "457.56", "low": "457.42", "close": "457.47", "volume": "107518" }, { "time": "2021-10-26 9:53", "open": "457.475", "high": "457.51", "low": "457.4", "close": "457.48", "volume": "78062" }, { "time": "2021-10-26 9:54", "open": "457.49", "high": "457.57", "low": "457.42", "close": "457.46", "volume": "133883" }, { "time": "2021-10-26 9:55", "open": "457.47", "high": "457.56", "low": "457.45", "close": "457.51", "volume": "98998" }, { "time": "2021-10-26 9:56", "open": "457.51", "high": "457.54", "low": "457.43", "close": "457.43", "volume": "110237" }, { "time": "2021-10-26 9:57", "open": "457.43", "high": "457.65", "low": "457.375", "close": "457.65", "volume": "98794" }, { "time": "2021-10-26 9:58", "open": "457.66", "high": "457.69", "low": "457.35", "close": "457.45", "volume": "262154" }, { "time": "2021-10-26 9:59", "open": "457.45", "high": "457.47", "low": "457.33", "close": "457.4", "volume": "74685" }, { "time": "2021-10-26 10:00", "open": "457.41", "high": "457.48", "low": "457.18", "close": "457.38", "volume": "166617" }, { "time": "2021-10-26 10:01", "open": "457.39", "high": "457.7", "low": "457.39", "close": "457.5", "volume": "265649" }, { "time": "2021-10-26 10:02", "open": "457.51", "high": "457.57", "low": "457.39", "close": "457.53", "volume": "131947" }, { "time": "2021-10-26 10:03", "open": "457.53", "high": "457.54", "low": "457.4", "close": "457.51", "volume": "80111" }, { "time": "2021-10-26 10:04", "open": "457.51", "high": "457.62", "low": "457.5", "close": "457.6101", "volume": "117174" }, { "time": "2021-10-26 10:05", "open": "457.621", "high": "457.64", "low": "457.51", "close": "457.58", "volume": "168758" }, { "time": "2021-10-26 10:06", "open": "457.58", "high": "457.64", "low": "457.46", "close": "457.61", "volume": "84076" }, { "time": "2021-10-26 10:07", "open": "457.62", "high": "457.7401", "low": "457.62", "close": "457.66", "volume": "125156" }, { "time": "2021-10-26 10:08", "open": "457.665", "high": "457.69", "low": "457.5", "close": "457.67", "volume": "116919" }, { "time": "2021-10-26 10:09", "open": "457.69", "high": "457.72", "low": "457.5", "close": "457.57", "volume": "102551" }, { "time": "2021-10-26 10:10", "open": "457.56", "high": "457.75", "low": "457.56", "close": "457.7", "volume": "109165" }, { "time": "2021-10-26 10:11", "open": "457.7", "high": "457.725", "low": "457.63", "close": "457.66", "volume": "146209" }, { "time": "2021-10-26 10:12", "open": "457.665", "high": "457.88", "low": "457.64", "close": "457.86", "volume": "210620" }, { "time": "2021-10-26 10:13", "open": "457.855", "high": "457.96", "low": "457.83", "close": "457.95", "volume": "159975" }, { "time": "2021-10-26 10:14", "open": "457.95", "high": "458.02", "low": "457.93", "close": "457.95", "volume": "152042" }, { "time": "2021-10-26 10:15", "open": "457.96", "high": "458.15", "low": "457.96", "close": "458.08", "volume": "146047" }, { "time": "2021-10-26 10:16", "open": "458.085", "high": "458.17", "low": "457.99", "close": "458.15", "volume": "100732" }, { "time": "2021-10-26 10:17", "open": "458.17", "high": "458.33", "low": "458.155", "close": "458.245", "volume": "235072" }, { "time": "2021-10-26 10:18", "open": "458.25", "high": "458.29", "low": "458.14", "close": "458.16", "volume": "422002" }, { "time": "2021-10-26 10:19", "open": "458.17", "high": "458.2801", "low": "458.1699", "close": "458.28", "volume": "114611" }, { "time": "2021-10-26 10:20", "open": "458.29", "high": "458.39", "low": "458.24", "close": "458.37", "volume": "241797" }, { "time": "2021-10-26 10:21", "open": "458.37", "high": "458.42", "low": "458.31", "close": "458.345", "volume": "124824" }, { "time": "2021-10-26 10:22", "open": "458.33", "high": "458.49", "low": "458.33", "close": "458.47", "volume": "132125" } ]) # Convert df to numeric and time to datetime re: the .csv to .json # converter tool I used online... df[['open','high','low','close','volume']] = df[['open','high','low','close','volume']].apply(pd.to_numeric) df['time'] = pd.to_datetime(df['time']) # Define an initial x1 value. This is the minimum offset index number for the min # of the last 20 rows. So, looking at the previous 20 rows, INCLUDING the current # one, if the minimum close value happened 5 bars ago, this number would be 5. # If the current bar holds the lowest close over the last 20 bars, this number # would be 0. This is not important to the question as this part works as is. df['initx1'] = df['close'][::-1].rolling(20).apply(np.argmax, raw=True).shift(-(20 1)) # Part of the strategy I'm wanting to emulate requires to get the second last initx1 # value and to add 2 to that number, so that's what I'm doing here df['x1'] = df['initx1'].shift(2) 2 # 'dynamicWindowSize' is called 'x2' and is just 1 x1 df['x2'] = df['x1'] 1 # To mitigate the error "trying to set rolling on an empty sequence" later, # just drop nan rows from x1 and x2 now. Will only be the top 20 rows anyway. df = df.dropna(subset=['x1', 'x2']) # Convert x2 to integer df['x2'] = df['x2'].astype(int) # Inspect print(df) # This is the pseudo code of what I want. It doesn't work now obviously, # but the rolling window size just needs to be the value in x2 for each # row df['lfOffset'] = df['close'][::-1].rolling(df['x2']).apply(np.argmin, raw=True).shift(-(df['x2'] 1)) # # I found an answer from https://stackoverflow.com/questions/49955008/python-pandas-rolling-mean-with-window-value-in-another-column # # that I tried modifying for my purposes here, but also doesn't work. # def get_lfOffset_min(x): # win = df.loc[:,'x2'].iloc[x.shape[0]-1].astype('int') # win = max(win, 0) # return pd.Series(x).rolling(window = win).apply(np.argmin, raw=True).shift(-(win 1)) #.mean().iloc[-1] # df.loc[:,'lfOffset'] = df.loc[:,'close'].expanding().apply(get_lfOffset_min)
Комментарии:
1. почему бы вам не попробовать .rolling(int(df[‘dynamicWindowSize’]))
2. @tdy Да, это похожая концепция, но эта функция agg с лямбда-вещами немного выше моего понимания, поэтому я не уверен, как заставить это работать здесь?
3. @deirdreamuel
TypeError: cannot convert the series to lt;class 'int'gt;
4. таким образом, df[‘dynamicWindowSize’] — это не целое число, а ряд. В коде, который у вас есть, у вас есть только «открыть», «высокий», «низкий»,»закрыть»,»громкость» и время. Как вы устанавливаете df[‘dynamicWindowSize’]?
5. Да, как сказано в вопросе, я не пытаюсь использовать статическое число, я хочу основывать расчет столбца на значениях в столбце «dynamicWindowSize», который для этой цели можно считать набором случайных целых чисел. Если вам нужно, используйте это для создания столбца случайных чисел:
data = np.random.randint(0, 10, size=len(df)) df['dynamicWindowSize'] = data
Ответ №1:
Обычно нельзя смотреть на два столбца внутри функции окна (функция, поставляемая в .apply()
). Начиная с Pandas 1.3 существует исключение: если использовать Numba и указать method="table"
, в функцию передается весь фрейм данных. Он передается в виде массива, что делает код несколько уродливым, но он выполняет свою работу.
import numba import math @numba.njit def compute_offset(arr): window_size = arr[-1, 1] min_ = math.inf min_idx = None window_start = max(arr.shape[0] - window_size, 0) for i in range(arr.shape[0] - 1, window_start - 1, -1): if arr[i, 0] lt; min_: min_ = arr[i, 0] min_idx = i result = np.empty(3) result[:2] = arr[-1, :2] result[2] = arr.shape[0] - 1 - min_idx # the offset return result offset = ( df[["close", "x2"]] .assign(offset=-1) .expanding(method="table") .apply(compute_offset, raw=True, engine='numba') .offset ) df_processed = df.join(offset)
Вот первые 20 строк результата, вычисленного на основе предоставленных входных данных: