#python-3.x #slurm #visdom
#python-3.x #глубокое обучение #slurm #sbatch #visdom
Вопрос:
Я хотел бы использовать visdom
для визуализации результатов алгоритма глубокого обучения, который обучается на удаленном кластерном сервере. Я нашел ссылку, в которой пытались описать правильный способ настройки всего этого в скрипте slurm.
python -u Script.py --visdom_server http://176.97.99.618 --visdom_port 8097
Я использую свой ip и 8097 для подключения к удаленному кластерному серверу:
ssh -L 8097:176.97.99.618:8097 my_userid@r@my_server_address
У меня есть следующие строки кода:
import visdom
import numpy as np
# Connection settings for the Visdom server the plots are sent to.
cfg = {
    "server": "176.97.99.618",
    "port": 8097,
}

# Visdom expects the server as a full URL, so the scheme is prepended to the
# host (the original line was missing the `+` between the two strings).
vis = visdom.Visdom(server=f"http://{cfg['server']}", port=cfg["port"])

# Handle of the line-plot window; stays None until the first point is plotted.
win = None
#Plotting on remote server
def update_viz(epoch, loss, title):
    """Plot one ``(epoch, loss)`` point on the Visdom line chart.

    The first call creates the chart, using ``title`` both as the window
    name and as the chart title, and stores the value returned by
    ``vis.line`` in the module-level ``win``. Every later call appends the
    new point to that window.

    Args:
        epoch: X coordinate of the point.
        loss: Y coordinate of the point.
        title: Window name and chart title; only used on the first call.
    """
    global win

    x = np.array([epoch])
    y = np.array([loss])

    if win is None:
        # Use the module-level client `vis`; the original referred to an
        # undefined name `viz`, which raised NameError on the first call.
        win = vis.line(
            X=x,
            Y=y,
            win=title,
            opts=dict(
                title=title,
                fillarea=True,
            ),
        )
    else:
        vis.line(
            X=x,
            Y=y,
            win=win,
            update='append',
        )
# Send the current ELBO2 value to the plot; `epoch` and `elbo2` are defined
# in the training code that is not shown in this snippet.
update_viz(epoch, elbo2.item(), 'ELBO2 Loss of beta distributions')
Я получил эту ошибку:
Setting up a new session...
Traceback (most recent call last):
File "/anaconda3/lib/python3.8/site-packages/urllib3/connection.py", line 174, in _ne
w_conn
conn = connection.create_connection(
File "/anaconda3/lib/python3.8/site-packages/urllib3/util/connection.py", line 96, in
create_connection
raise err
File "/anaconda3/lib/python3.8/site-packages/urllib3/util/connection.py", line 86, in
create_connection
sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in
urlopen
httplib_response = self._make_request(
File "/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 394, in
_make_request
conn.request(method, url, **httplib_request_kw)
File "/anaconda3/lib/python3.8/site-packages/urllib3/connection.py", line 239, in req
uest
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/anaconda3/lib/python3.8/http/client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/anaconda3/lib/python3.8/http/client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/anaconda3/lib/python3.8/http/client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/anaconda3/lib/python3.8/http/client.py", line 1010, in _send_output
self.send(msg)
File "/anaconda3/lib/python3.8/http/client.py", line 950, in send
self.connect()
File "/anaconda3/lib/python3.8/site-packages/urllib3/connection.py", line 205, in con
nect
conn = self._new_conn()
File "/anaconda3/lib/python3.8/site-packages/urllib3/connection.py", line 186, in _ne
w_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7ff292f14d00
>: Failed to establish a new connection: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anaconda3/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 755, in
urlopen
retries = retries.increment(
File "/anaconda3/lib/python3.8/site-packages/urllib3/util/retry.py", line 574, in inc
rement
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='176.97.99.618', port=8097): Max retries
exceeded with url: /env/main (Caused by NewConnectionError('<urllib3.connection.HTTPConnection obj
ect at 0x7ff292f14d00>: Failed to establish a new connection: [Errno 110] Connection timed out'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anaconda3/lib/python3.8/site-packages/visdom/__init__.py", line 708, in _send
return self._handle_post(
File "/anaconda3/lib/python3.8/site-packages/visdom/__init__.py", line 677, in _handl
e_post
r = self.session.post(url, data=data)
File "/anaconda3/lib/python3.8/site-packages/requests/sessions.py", line 590, in post
return self.request('POST', url, data=data, json=json, **kwargs)
File "/anaconda3/lib/python3.8/site-packages/requests/sessions.py", line 542, in requ
est
resp = self.send(prep, **send_kwargs)
File "/anaconda3/lib/python3.8/site-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/anaconda3/lib/python3.8/site-packages/requests/adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='192.168.2.10', port=8097): Max retri
es exceeded with url: /env/main (Caused by NewConnectionError('<urllib3.connection.HTTPConnection
object at 0x7ff292f14d00>: Failed to establish a new connection: [Errno 110] Connection timed out'
))
Visdom python client failed to establish socket to get messages from the server. This feature is o
ptional and can be disabled by initializing Visdom with `use_incoming_socket=False`, which will pr
event waiting for this request to timeout.
Script.py:41: UserWarning: To copy construct from a tensor, it is recommended to us
e sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than
torch.tensor(sourceTensor).
params['w'].append(nn.Parameter(torch.tensor(Normal(torch.zeros(n_in, n_out), std * torch.ones(n
_in, n_out)).rsample(), requires_grad=True, device=device)))
Script.py:42: UserWarning: To copy construct from a tensor, it is recommended to us
e sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than
torch.tensor(sourceTensor).
params['b'].append(nn.Parameter(torch.tensor(torch.mul(bias_init, torch.ones([n_out,])), require
s_grad=True, device=device)))
Script.py:292: UserWarning: To copy construct from a tensor, it is recommended to u
se sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather tha
n torch.tensor(sourceTensor).
return torch.exp(torch.lgamma(torch.tensor(a, dtype=torch.float, requires_grad=True).to(device=l
ocal_device)) torch.lgamma(torch.tensor(b, dtype=torch.float, requires_grad=True).to(device=loca
l_device)) - torch.lgamma(torch.tensor(a b, dtype=torch.float, requires_grad=True).to(device=local
_device)))
Script.py:679: UserWarning: This overload of add_ is deprecated:
add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
add_(Tensor other, *, Number alpha) (Triggered internally at /opt/conda/conda-bld/pytorch
_1631630815121/work/torch/csrc/utils/python_arg_parser.cpp:1025.)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
[Errno 110] Connection timed out
on_close() takes 1 positional argument but 3 were given
Traceback (most recent call last):
File "Script.py", line 873, in <module>
update_viz(epoch, elbo2.item(), 'ELBO2 Loss of beta distributions')
File "Script.py", line 736, in update_viz
win = viz.line(
NameError: name 'viz' is not defined
Как я могу запустить свой сценарий построения графика на удаленном сервере? Какая команда запуска python должна быть в моем скрипте SLURM? Как я могу сохранить график и позже переместить его на свой ноутбук с помощью команды scp?
Ответ №1:
Попробуйте добавить строку global viz после строки global win.