Как получить содержимое html-тега, не являющееся его атрибутом, через BeautifulSoup?

#python #html #beautifulsoup

Вопрос:

У меня есть следующий html-код, который мне нужно разобрать. Как получить доступ ко всем элементам window.WIZ_global_data в виде обычного словаря? И является ли window.WIZ_global_data атрибутом тега script?

Я могу получить доступ к тегу script через soup.head.script.

 lt;!DOCTYPE htmlgt; lt;html dir="ltr" lang="no"gt;  lt;headgt;  lt;base href="https://consent.google.com/"/gt;  lt;meta content="origin" name="referrer"/gt;  lt;link href="https://consent.google.com/m" rel="canonical"/gt;  lt;meta content="initial-scale=1,minimum-scale=1,maximum-scale=5,width=device-width" name="viewport"/gt;  lt;link href="//www.google.com/favicon.ico" rel="shortcut icon"/gt;  lt;script data-id="_gd" nonce="QI9 XeJ9TwHcCspiFyqIIQ"gt;  window.WIZ_global_data = {"DndLYb":"","DpimGf":false,"EP1ykd":["/_/*"],"FdrFJe":"2318287307032857584","GVlsxf":"www.google.com","Im6cmf":"/_/ConsentUi","LVIXXb":1,"LoQv7e":false,"MT7f9b":[],"Mypbod":"https://www.googleapis.com/reauth","PYFuDc":"DUMMY_X_CLIENT_DATA_WIZ_GLOBAL_KEY_DO_NOT_USE","QrtxK":"","R6pIad":"%.@.]","S06Grb":"","TTHqvb":"https://kidsmanagement-pa.googleapis.com","Yllh3e":"%.@.1637316536530302,170695567,503968613]","cfb2h":"boq_identityfrontenduiserver_20211111.08_p0","eNnkwf":"1637316536","ejMLCd":"DUMMY_X_GEO_WIZ_GLOBAL_KEY_DO_NOT_USE","eptZe":"/_/ConsentUi/","fPDxwd":[1763433,1772879,1782333,45814370],"gGcLoe":false,"ksKYzf":"%.@.false,true,false,false,null,false,null,"",null,[["RelayState","SAMLRequest","SigAlg","Signature","TL","af","alwf","btmpl","c","cbflow","cd","checkConnection","checkedDomains","client_id","continue","cpbps","dsh","emr","faa","flowEntry","flowName","followup","forceOsidOriginForTest","gae","go","hd","hide_status_bar","hl","idvToken","ifkv","ifr","ignoreShadow","kdi","kid_continue","ltmpl","marl","migrate","multilogin","next","oauth","osid","pageId","passwdsession","platform_variant","pstMsg","rart","rip","rm","sarp","scc","scope","secure","sendvemail","service","session","skipShadow","skipvpage","source","ss","ss_mode","sspa","t","target","theme","ul"]],null,null,[],[[null,null,"https://accounts.google.com/AccountChooser?continue\u003dhttps://www.google.com/maps/place/Hauges+gate+66,+3019+Drammen/@59.7476727,10.1924839,17z/data=!3m1!4b1!4m5!3m4!1s0x46412335e0db3d83:0x544c9
9fb4daa3946!8m2!3d59.7476727!4d10.1946726\u0026hl\u003dno"],null,[null,null,"https://accounts.google.com/signin/recovery?continue\u003dhttps://www.google.com/maps/place/Hauges+gate+66,+3019+Drammen/@59.7476727,10.1924839,17z/data=!3m1!4b1!4m5!3m4!1s0x46412335e0db3d83:0x544c99fb4daa3946!8m2!3d59.7476727!4d10.1946726\u0026hl\u003dno"],[null,null,"https://accounts.google.com/restart?continue\u003dhttps://www.google.com/maps/place/Hauges+gate+66,+3019+Drammen/@59.7476727,10.1924839,17z/data=!3m1!4b1!4m5!3m4!1s0x46412335e0db3d83:0x544c99fb4daa3946!8m2!3d59.7476727!4d10.1946726\u0026hl\u003dno"]],null,false,[]]","nQyAE":{"vEMF5e":"false","LlkYkf":"false","WBBR0d":"true","ViN5Xd":"false","p6p11":"true","EoymAc":"false","FbHgvb":"false","P1ceCf":"false","tBSlob":"false","XqMd3":"false","Ee3RQb":"false","a76Enc":"false","G25Msb":"false","VL5wad":"false","XSgnJf":"false","APYQvd":"false","rONKDd":"false","Dpi3Gf":"false","lttELb":"false","YDrknb":"false","kQrwQd":"false","ihOA2":"false","Ozjmee":"false"},"qwAQke":"ConsentUi","qymVe":"YNnh99mAz0-TXbG5yo-NTKQ-0l4","rtQCxc":-60,"thykhd":"AKH95euIF6P3sAglrRifxgEcfcB9vtI_XXJFaZ5Oty53KFviM2VkkPyRfmncVF-cgLGCK0LYtfquTSFWlA3cjJWXXNjjUICQ5v3brxMomscQEgs9760XyDgPgf0u003d","unNRMb":"AKJVzcpKsW1gHA6-l3WGdp1MQsU_eTjL-Ufc5L6-PjxIndj8D2t6PFy3SRwl7yc0EXpFbqSSCh50","vAyiz":"ChUI/o793Lf0vtdFEK6ek/ubmfGdvgEu003d","w2btAe":"%.@.null,null,"",false,null,null,true,false]","zChJod":"%.@.]"};  lt;/scriptgt;  

Дополнение: я также могу использовать soup.head.script.text, чтобы получить содержимое в виде строки, но хотел бы получить его в виде словаря и более «питоническим» способом.

Ответ №1:

Вы можете с помощью модуля re найти эту переменную и извлечь её значение, а затем передать его в json.loads(), чтобы работать с ним как со словарём (dict):

 json.loads(re.search(r'window.WIZ_global_data = ({.*})', html).group(1))  

Пример

 html=r''' lt;!DOCTYPE htmlgt; lt;html dir="ltr" lang="no"gt;  lt;headgt;  lt;base href="https://consent.google.com/"/gt;  lt;meta content="origin" name="referrer"/gt;  lt;link href="https://consent.google.com/m" rel="canonical"/gt;  lt;meta content="initial-scale=1,minimum-scale=1,maximum-scale=5,width=device-width" name="viewport"/gt;  lt;link href="//www.google.com/favicon.ico" rel="shortcut icon"/gt;  lt;script data-id="_gd" nonce="QI9 XeJ9TwHcCspiFyqIIQ"gt;  window.WIZ_global_data = {"DndLYb":"","DpimGf":false,"EP1ykd":["/_/*"],"FdrFJe":"2318287307032857584","GVlsxf":"www.google.com","Im6cmf":"/_/ConsentUi","LVIXXb":1,"LoQv7e":false,"MT7f9b":[],"Mypbod":"https://www.googleapis.com/reauth","PYFuDc":"DUMMY_X_CLIENT_DATA_WIZ_GLOBAL_KEY_DO_NOT_USE","QrtxK":"","R6pIad":"%.@.]","S06Grb":"","TTHqvb":"https://kidsmanagement-pa.googleapis.com","Yllh3e":"%.@.1637316536530302,170695567,503968613]","cfb2h":"boq_identityfrontenduiserver_20211111.08_p0","eNnkwf":"1637316536","ejMLCd":"DUMMY_X_GEO_WIZ_GLOBAL_KEY_DO_NOT_USE","eptZe":"/_/ConsentUi/","fPDxwd":[1763433,1772879,1782333,45814370],"gGcLoe":false,"ksKYzf":"%.@.false,true,false,false,null,false,null,"",null,[["RelayState","SAMLRequest","SigAlg","Signature","TL","af","alwf","btmpl","c","cbflow","cd","checkConnection","checkedDomains","client_id","continue","cpbps","dsh","emr","faa","flowEntry","flowName","followup","forceOsidOriginForTest","gae","go","hd","hide_status_bar","hl","idvToken","ifkv","ifr","ignoreShadow","kdi","kid_continue","ltmpl","marl","migrate","multilogin","next","oauth","osid","pageId","passwdsession","platform_variant","pstMsg","rart","rip","rm","sarp","scc","scope","secure","sendvemail","service","session","skipShadow","skipvpage","source","ss","ss_mode","sspa","t","target","theme","ul"]],null,null,[],[[null,null,"https://accounts.google.com/AccountChooser?continue\u003dhttps://www.google.com/maps/place/Hauges+gate+66,+3019+Drammen/@59.7476727,10.1924839,17z/data=!3m1!4b1!4m5!3m4!1s0x46412335e0db3d
83:0x544c99fb4daa3946!8m2!3d59.7476727!4d10.1946726\u0026hl\u003dno"],null,[null,null,"https://accounts.google.com/signin/recovery?continue\u003dhttps://www.google.com/maps/place/Hauges+gate+66,+3019+Drammen/@59.7476727,10.1924839,17z/data=!3m1!4b1!4m5!3m4!1s0x46412335e0db3d83:0x544c99fb4daa3946!8m2!3d59.7476727!4d10.1946726\u0026hl\u003dno"],[null,null,"https://accounts.google.com/restart?continue\u003dhttps://www.google.com/maps/place/Hauges+gate+66,+3019+Drammen/@59.7476727,10.1924839,17z/data=!3m1!4b1!4m5!3m4!1s0x46412335e0db3d83:0x544c99fb4daa3946!8m2!3d59.7476727!4d10.1946726\u0026hl\u003dno"]],null,false,[]]","nQyAE":{"vEMF5e":"false","LlkYkf":"false","WBBR0d":"true","ViN5Xd":"false","p6p11":"true","EoymAc":"false","FbHgvb":"false","P1ceCf":"false","tBSlob":"false","XqMd3":"false","Ee3RQb":"false","a76Enc":"false","G25Msb":"false","VL5wad":"false","XSgnJf":"false","APYQvd":"false","rONKDd":"false","Dpi3Gf":"false","lttELb":"false","YDrknb":"false","kQrwQd":"false","ihOA2":"false","Ozjmee":"false"},"qwAQke":"ConsentUi","qymVe":"YNnh99mAz0-TXbG5yo-NTKQ-0l4","rtQCxc":-60,"thykhd":"AKH95euIF6P3sAglrRifxgEcfcB9vtI_XXJFaZ5Oty53KFviM2VkkPyRfmncVF-cgLGCK0LYtfquTSFWlA3cjJWXXNjjUICQ5v3brxMomscQEgs9760XyDgPgf0u003d","unNRMb":"AKJVzcpKsW1gHA6-l3WGdp1MQsU_eTjL-Ufc5L6-PjxIndj8D2t6PFy3SRwl7yc0EXpFbqSSCh50","vAyiz":"ChUI/o793Lf0vtdFEK6ek/ubmfGdvgEu003d","w2btAe":"%.@.null,null,"",false,null,null,true,false]","zChJod":"%.@.]"};  lt;/scriptgt; '''   import json,re json.loads(re.search(r'window.WIZ_global_data = ({.*})', html).group(1))