Скрапинг веб-страниц после входа в систему

#python #web-scraping #beautifulsoup #python-requests

Вопрос:

Я выполняю следующий код, чтобы войти в систему по URL-адресу, заданному в переменной loginUrl. После аутентификации я хочу перейти на другую веб-страницу, URL-адрес которой хранится в переменной portfolioUrl. Однако, когда я выполняю print(portfolioPage.content), выводится веб-страница, которая отображается сразу после входа в систему, а не страница portfolioPage, которая мне нужна. Что не так с моим кодом?

 from bs4 import BeautifulSoup
import requests
# Create a session so that cookies set during login are reused by later requests.
session = requests.Session()

# OpenID Connect authorization endpoint. The query parameters are separated
# by '&' (the separators had been garbled into 'amp;').
loginUrl = 'https://www.investopedia.com/auth/realms/investopedia/protocol/openid-connect/auth?client_id=inv-simulator&redirect_uri=https://www.investopedia.com/auth/realms/investopedia/shopify-auth/inv-simulator/login?&redirectUrl=https%3A%2F%2Fwww.investopedia.com%2Fauth%2Frealms%2Finvestopedia%2Fprotocol%2Fopenid-connect%2Fauth%3Fresponse_type%3Dcode%26approval_prompt%3Dauto%26redirect_uri%3Dhttps%253A%252F%252Fwww.investopedia.com%252Fsimulator%252Fhome.aspx%26client_id%3Dinv-simulator-conf&state=7edda3b2-eb6a-441f-8589-b42b8b78accf&response_mode=fragment&response_type=code&scope=openid&nonce=cd558670-7ae3-4c14-8281-bc149d4987b3'
portfolioUrl = 'https://www.investopedia.com/simulator/trade/tradestock.aspx'

# Credentials sent as the form fields of the login POST.
payload = {
    'username': 'my email',
    'password': 'my password'
}

# Load the login page and read the URL the login form submits to.
authPage = session.get(loginUrl)
soup = BeautifulSoup(authPage.content, 'html.parser')
form = soup.find('form')
postUrl = form['action']

# Submit the credentials to the form's action URL using the same session.
auth = session.post(postUrl, data=payload)

# Request the target page with the (hopefully authenticated) session and
# print the raw response body.
portfolioPage = session.get(portfolioUrl)
soup = BeautifulSoup(portfolioPage.content, 'html.parser')
print(portfolioPage.content)
 

Правка: ответ t4kq работает отлично; однако, когда я выполняю print(page.text), выводится не ожидаемый HTML-код страницы, а вот такой код:

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" version="XHTML RDFa 1.0" dir="ltr">
<head profile="http://www.w3.org/1999/xhtml/vocab">
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <meta name="application-name" content="Investopedia"/>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta http-equiv="X-UA-Compatible" content="IE=9">

    <!-- Page Taxonomy -->
<script type="text/javascript">
//<![CDATA[
  var _pageTaxonomy = {
    "Hashkey": "$simulator$trade$tradestock",
    "Channel": "Simulator",
    "SubChannel": "",
    "Advertising": "Investing",
    "SubAdvertising": "Simulator",
    "AdTarget": "investopedia.com/simulator",
    "DfpTarget": "Investing/Investing",
    "Tags": null,
    "Type": "Simulator",
    "Lucrativeness": null,
    "Timelessness": "Timeless",
    "Feature": "",
    "Design": "",
    "InterestLevel": null,
    "Path" : "/simulator/trade/tradestock.aspx",
  };
//]]>
</script>
<!-- End Page Taxonomy -->
    <script language="javascript" type="text/javascript">var idc_slots = {};
        idc_slots.slots = ["AdSlot_AF-Top-Leaderboard","AdSlot_AF-Left-Multi","AdSlot_BF-Right-Button1","AdSlot_BF-Right-Button2","AdSlot_BF-Right-Button3","AdSlot_BF-Right-Button4"];
        idc_slots.build = function(slot) {
            return "/479/INV-NA/Investing/Investing/position/Simulator".replace("position", slot.position);
        };</script><script type="text/javascript">
            idc_slots.slots.push({
                        "AdSlot_AF-Top-Leaderboard" : {
                            sizeMappings: [
                                {
                                    viewportSize: [1000, 1],
                                    slotSizes: [[728, 90], [970, 90], [950, 90], [960, 90], [970, 66], [980, 90],"fluid"]
                                },
                                {
                                    viewportSize: [700, 1],
                                    slotSizes: [[728, 90], [468, 60]]
                                },
                                {
                                    viewportSize: [400, 1],
                                    slotSizes: [468, 60]
                                },
                                {
                                    viewportSize: [0, 0],
                                    slotSizes: [[320, 50], [320, 100]]
                                }
                            ],
                            amzSizes : {
                                desktop: [[728 ,90]],
                                tablet: [[728 ,90]],
                                phone: [[728 ,90], [320,50]]
                            }
                        }});
                </script>

    <title>Investopedia Stock Simulator - Investopedia Stock Simulator - Trade a Stock</title><meta name="Description" content="Fantasyamp;#x20;stockamp;#x20;marketamp;#x20;gameamp;#x20;thatamp;#x20;simulatesamp;#x20;tradingamp;#x20;stocksamp;#x20;andamp;#x20;options.">
<meta name="viewport" content="widthamp;#x3D;device-width,amp;#x20;initial-scaleamp;#x3D;1">
<meta name="Description" content="Fantasyamp;#x20;stockamp;#x20;marketamp;#x20;gameamp;#x20;thatamp;#x20;simulatesamp;#x20;tradingamp;#x20;stocksamp;#x20;andamp;#x20;options.">
<meta name="viewport" content="widthamp;#x3D;device-width,amp;#x20;initial-scaleamp;#x3D;1">    <link rel="canonical" href="https://www.investopedia.com/simulator/trade/tradestock.aspx" />
    <link href="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;imgamp;#x2F;favicon.ico" rel="shortcutamp;#x20;icon" type="imageamp;#x2F;vnd.microsoft.icon">
<link href="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;destamp;#x2F;cssamp;#x2F;simulator.cssamp;#x3F;vamp;#x3D;202102030915" media="screen" rel="stylesheet" type="textamp;#x2F;css">
<link href="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;imgamp;#x2F;favicon.ico" rel="shortcutamp;#x20;icon" type="imageamp;#x2F;vnd.microsoft.icon"><script language="javascript" type="text/javascript">
    var googletag = googletag || {};
    googletag.cmd = googletag.cmd || [];
</script><script language="javascript" type="text/javascript">
    var sem_pageview = false;
    var sem_ocode = '9999';
    var sem_ldid = '';
    var sem_sh = '';
    function updateSemVariable(query) {
        if (query[1] === undefined) {
            return;
        }
        switch(query[0]) {
            case 'o':
                sem_ocode = query[1];
                break;
            case 'ldid':
                sem_ldid = query[1];
                break;
            case 'sh':
                sem_sh = query[1];
                break;
        }
    }
    function getCookie(cname) {
        var name = cname   "=";
        var ca = document.cookie.split(';');
        for (var i = 0; i < ca.length; i  ) {
            var c = ca[i];
            while (c.charAt(0) == ' ') c = c.substring(1);
            if (c.indexOf(name) == 0) return c.substring(name.length,c.length);
        }
        return "";
    }
    function getSemCookie() {
        var queryStr = getCookie('semuser');
        if (queryStr == "") {
            return;
        }
        sem_pageview = true;
        var queries = queryStr.split("amp;");
        for (var i = 0, l = queries.length; i < l; i  ) {
            var query = queries[i].split('=');
            updateSemVariable(query);
        }
    }
    getSemCookie();
    var updateAup = function(aUp) {
        aUp = aUp.replace("INV-NA", "invsem-serp-ds");
        var utms = null;
        if (typeof getUrlParam === "function") {
            try {
                utms = getUrlParam("utm_source");
            } catch (e) {}
        }
        var aUp_arr = aUp.split("/");
        var last = aUp_arr.pop();
        aUp_arr.push((utms !== null ? utms : "dir")  
            "_"   (typeof sem_ocode !== "undefined" ?
                sem_ocode : 0));
        if (aUp_arr.length > 3) {
            aUp_arr[3] = last;
        }
        return aUp_arr.join("/");
    };
    if (typeof googletag !== "undefined") {
        googletag.cmd.push( function() {
            if ((typeof sem_pageview !== 'undefined') amp;amp; (sem_pageview == true)) {
                var processArgs = function(arguments) {
                    if (typeof arguments === "object") {
                        for (var i = 0; i < arguments.length; i  ) {
                            if (arguments[i].indexOf("479") > -1) {
                                arguments[i] = updateAup(arguments[i]);
                                break;
                            }
                        }
                    }
                    return arguments;
                };
                googletag.defineSlot = (function() {
                    var orig_func = googletag.defineSlot;
                    return function() {
                        return orig_func.apply(this, processArgs(arguments));
                    };
                })();
                googletag.defineOutOfPageSlot = (function() {
                    var orig_func = googletag.defineOutOfPageSlot;
                    return function() {
                        return orig_func.apply(this, processArgs(arguments));
                    };
                })();
            }
        });
    }
</script><script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;simulatoramp;#x2F;jsamp;#x2F;jquery.min.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;jsamp;#x2F;jquery.mcs.min.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;simulatoramp;#x2F;jsamp;#x2F;cookie.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;simulatoramp;#x2F;jsamp;#x2F;cookiemix.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;simulatoramp;#x2F;jsamp;#x2F;g.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;simulatoramp;#x2F;jsamp;#x2F;microsoftAjax.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;publicamp;#x2F;simulatoramp;#x2F;jsamp;#x2F;microsoftAjaxWebForms.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;simulator_uiamp;#x2F;jsamp;#x2F;ScrollingTicker.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;cdn.jsdelivr.netamp;#x2F;npmamp;#x2F;promise-polyfillamp;#x40;7amp;#x2F;distamp;#x2F;polyfill.min.js"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;destamp;#x2F;jsamp;#x2F;inv.min.jsamp;#x3F;vamp;#x3D;202102030915"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;distamp;#x2F;simulator.min.js"></script>
<script type="textamp;#x2F;javascript" src="httpsamp;#x3A;amp;#x2F;amp;#x2F;i.investopedia.comamp;#x2F;distamp;#x2F;gdpr.min.jsamp;#x3F;vamp;#x3D;202102030915"></script>   

<script type="text/javascript">
eval(function(p,a,c,k,e,d){e=function(c){return c.toString(36)};if(!''.replace(/^/,String)){while(c--){d[c.toString(a)]=k[c]||c.toString(a)}k=[function(e){return d[e]}];e=function(){return'\w '};c=1};while(c--){if(k[c]){p=p.replace(new RegExp('\b' e(c) '\b','g'),k[c])}}return p}('7 2(9){o d(9)}a 0={4:'',3:'e',6:'',5:''};a 8=f.c({h:2('i='),1:{g:2('j='),k:2('m')}});8.n(7(1){0.4=1['4']||0.4;0.3=1['3']||0.3;0.6=1['b']||0.6;0.5=1['l']||0.5});',25,25,'geoData|data|decode|country_code|city|FIN_zip|FIN_state|function|jqXHR|encoded|var|region_code|ajax|atob|FR|jQuery|access_key|url|aHR0cHM6Ly9hcGkuaXBzdGFjay5jb20vY2hlY2s|MTBlZjJlYjI2NzFhNjQ5MTQ5NDk1ODZjMzExMDdiYWQ|fields|zip|Y2l0eSxjb3VudHJ5X2NvZGUscmVnaW9uX2NvZGUsemlw|done|return'.split('|'),0,{}))
</script>

    <script type="text/javascript">
        (function(d) {
            var e = d.createElement('script');
            e.src = d.location.protocol   '//tag.bounceexchange.com/2320/i.js';
            e.async = true;
            d.getElementsByTagName("head")[0].appendChild(e);
        }(document));
    </script>
</head>

<!--shift_source: 4824cfbe9ef0-->
<body class="simulator-page" onunload="SaveTickerPos();">
<div style="display: none;">
    <!-- Start of DoubleClick Spotlight Tag: Please do not remove -->
    <!-- Activity Name for this tag is:IP Simulator -->
    <!-- Web site URL where tag should be placed: http://www.investopedia.com/simulator -->
    <!-- This tag must be placed within the opening <body> tag, as close to the beginning of it as possible -->
    <!-- Creation Date: Thu Jul 02 17:02:35 EDT 2009 -->
    <script language="JavaScript">
        function SaveTickerPos()
        {
            try
            {
                for (var obj in allTickers){
                    allTickers[obj].paused = true;
                    jQuery.cookie(allTickers[obj].cookieName, allTickers[obj].x, {path: '/'});
                }
            }
            catch(e){}
        }

        var axel = Math.random()   "";
        var a = axel * 10000000000000;
        document.write('<img src="https://ad.doubleclick.net/activity;src=2359949;type=ips;cat=ips;ord=1;num='   a   '?" width=1 height=1 border=0>');
    </script>
    <noscript>
        <img src="https://ad.doubleclick.net/activity;src=2359949;type=ips;cat=ips;ord=1;num=1?" width=1 height=1 border=0>
    </noscript>
    <!-- End of DoubleClick Spotlight Tag: Please do not remove -->

    <!-- Begin comScore Tag -->
    <script type="text/javascript" language="javascript">
        var _comscore = _comscore || [];
        _comscore.push({ c1: "2", c2: "18280457", c4: "https://www.investopedia.com/simulator/trade/tradestock.aspx" });
        (function() {
            var s = document.createElement("script"), el = document.getElementsByTagName("script")[0]; s.async = true;
            s.src = (document.location.protocol == "https:" ? "https://sb" : "http://b")   ".scorecardresearch.com/beacon.js";
            el.parentNode.insertBefore(s, el);
        })();
    </script>
    <noscript>
        <img src="https://sb.scorecardresearch.com/p?c1=2amp;c2=18280457amp;c4=https://www.investopedia.com/simulator/trade/tradestock.aspxamp;cv=2.0amp;cj=1" />
    </noscript>
    <!-- End comScore Tag -->
</div>
<script type='text/javascript' language="JavaScript">
    //<![CDATA[
    if (getCookie('freenewsletterreg') == null) {
        setCookie("freenewsletterreg", "ad", 30);
    }
    var user_info = $.parseJSON(decodeURIComponent(getCookie('user_info')).replace(/ /g, ' '));
    //]]>
</script>


<!--<script type='text/javascript' src="https://www.investopedia.com/simulator/Common/VcidScript.ashx?u=e3bfd87f21d741578241089c9aa5f4c8"></script>-->
<!-- Google Tag Manager -->
<noscript>
  <iframe src="//www.googletagmanager.com/ns.html?id=GTM-5V3WHJ"
        height="0" width="0"
        style="display:none;visibility:hidden"></iframe>
</noscript>
<script>(function (w, d, s, l, i) {
    w[l] = w[l] || [];
    w[l].push({'gtm.start': new Date().getTime(), event: 'gtm.js'});
    var f = d.getElementsByTagName(s)[0],
        j = d.createElement(s), dl = l != 'dataLayer' ? 'amp;l='   l : '';
    j.async = true;
    j.src =
        '//www.googletagmanager.com/gtm.js?id='   i   dl;
    f.parentNode.insertBefore(j, f);
})(window, document, 'script', 'dataLayer', 'GTM-5V3WHJ');</script>
<!-- End Google Tag Manager -->

<script type="text/javascript">
    dataLayer.push(_pageTaxonomy);
    var pageviewID = genPageviewId();
    dataLayer.push({'pageviewID' : pageviewID});
</script>

<!-- ================================= Header ================================= -->
<div id="Header">
    <div class="mid">
        <div class="brand clear layout-size">
            <a href="//index.investopedia.com/"><div class="m-search-icon"><i></i></div></a>
            <div class="logo-container">
                <a href="/" class="logo"></a>
                <div class="button-container">
                    <a class="button view-markets-btn inv-ga-link-tracking" href="/markets/" target="_blank" data-ga-label="blue-markets-cta">      
                        View Markets
                    </a>
                </div>
            </div>
            <div id="ctl00_AdLeaderBoard1_cgiAdTopLeaderboard" class="leader">
                                    <div id='AdSlot_AF-Top-Leaderboard' adonis-marker></div>
                            </div>
        </div>
    </div>
</div>
<!-- ================================= Header //End ================================= -->


<!-- ================================= Content ================================= -->
<div id="Content" class="full">
    <!-- ================================= Left Navigation ================================= -->

    <div class="left-nav">
                    <div class="label">
                Trade            </div>
            <ul>
                                    <li class="">
                                                    <span></span>
                                                <a href="https://www.i
 

Ответ №1:

Мне кажется, вы неправильно отправляете свои данные (POST-запросом) и не держите сеанс открытым после входа в систему. Попробуйте так…

 # Use requests.Session() as a context manager so the session is closed automatically once done.
with requests.Session() as login_request:
    payload = {
        'username': 'my email',
        'password': 'my password',
    }
    # Send the credentials; the session keeps any cookies the server sets.
    login_request.post(loginUrl, data=payload)

    # Fetch the page behind portfolioUrl inside the `with` block, i.e. before
    # the context manager closes the session.
    source_code = login_request.get(portfolioUrl).content

# After this you can use soup to parse the source_code.
soup = BeautifulSoup(source_code, 'html.parser')

print(soup)  # to check whether it prints the logged-in data
 

Комментарии:

1. Это просто выводит страницу входа в систему.

Ответ №2:

Вы можете попробовать это

 import requests
from bs4 import BeautifulSoup

# Start a session so that cookies from the login carry over to later requests.
session = requests.Session()

url = 'https://investopedia.com/simulator/portfolio/'

payload = {
    'username': 'your_email',
    'password': 'your_password'
}

# Request the portfolio URL and parse the response, which the script treats
# as the log-in page.
login_soup = BeautifulSoup(session.get(url).content, 'html.parser')

# The first <form> on that page supplies the URL the credentials are posted to.
login_form = login_soup.find('form')
session.post(login_form['action'], data=payload)

# Fetch the trade page through the same session and parse it.
content_url = 'https://investopedia.com/simulator/trade/tradestock.aspx'
page = session.get(content_url)
page_soup = BeautifulSoup(page.content, 'html.parser')

# Take the second 'table2' table inside the 'sim-page' container.
sim_container = page_soup.find('div', class_='sim-page')
summary_table = sim_container.find_all('table', class_='table2')[1]

# Print each row's header cell, then its value cell, then a separator line.
for summary_row in summary_table.find_all('tr'):
    for cell_tag in ('th', 'td'):
        print(summary_row.find(cell_tag).text)
    print('----')
 
 Value (USD)
$10,000.00
----
Buying Power
$10,000.00
----
Cash
$10,000.00
----
 

Комментарии:

1. Я знаю, что эта тема немного устарела, но у меня есть вопрос, если можно. Когда я выполняю print(page.text), выводится не ожидаемый HTML-код страницы, а другой, странный код, который вы найдете в отредактированном вопросе. Почему это происходит?

2. @bassel2777 я не могу понять, что ты имеешь в виду

3. Я имею в виду, что хочу напечатать HTML-код страницы с помощью print(page.text), который должен выводить HTML-код страницы, но этого не происходит!

4. Это дает странный результат, который я привел в посте выше.

5. @bassel2777 значит, мой ответ тебя не устраивает?