How do you connect Django code with Python?

#python #django

Question:

I'm fairly new to Django and Python. I'm currently building a crawler that searches for emails, and I already have the crawler built in Python. But I want to hook it up to a search bar in Django, so that when it is clicked the code picks up the URL and the Python code runs. I don't know how to do this.

So far I have tried putting the code into views.py, hooking it up in urls.py, and connecting that URL to the button. But it doesn't work.

Here is the crawler code:

import re
import requests
import requests.exceptions
from urllib.parse import urlsplit, urljoin
from lxml import html
import sys
import csv


class EmailCrawler:

    processed_urls = set()
    unprocessed_urls = set()
    emails = set()

    def __init__(self, website: str):
        self.website = website
        self.unprocessed_urls.add(website)
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.70 Chrome/78.0.3904.70 Safari/537.36',
        }
        self.base_url = urlsplit(self.website).netloc
        self.outputfile = self.base_url.replace('.', '_') + '.csv'
        # we will use this list to skip urls that end with one of these extensions. This will save us a lot of bandwidth and speed up the crawling process
        # for example: www.example.com/image.png --> this url is useless for us. we cannot possibly parse email from images and all other types of files.
        self.garbage_extensions = ['.aif','.cda','.mid','.midi','.mp3','.mpa','.ogg','.wav','.wma','.wpl','.7z','.arj','.deb','.pkg','.rar','.rpm','.tar.gz','.z','.zip','.bin','.dmg','.iso','.toast','.vcd','.csv','.dat','.db','.dbf','.log','.mdb','.sav','.sql','.tar','.apk','.bat','.bin','.cgi','.pl','.exe','.gadget','.jar','.py','.wsf','.fnt','.fon','.otf','.ttf','.ai','.bmp','.gif','.ico','.jpeg','.jpg','.png','.ps','.psd','.svg','.tif','.tiff','.asp','.cer','.cfm','.cgi','.pl','.part','.py','.rss','.key','.odp','.pps','.ppt','.pptx','.c','.class','.cpp','.cs','.h','.java','.sh','.swift','.vb','.ods','.xlr','.xls','.xlsx','.bak','.cab','.cfg','.cpl','.cur','.dll','.dmp','.drv','.icns','.ico','.ini','.lnk','.msi','.sys','.tmp','.3g2','.3gp','.avi','.flv','.h264','.m4v','.mkv','.mov','.mp4','.mpg','.mpeg','.rm','.swf','.vob','.wmv','.doc','.docx','.odt','.pdf','.rtf','.tex','.txt','.wks','.wps','.wpd']
        self.email_count = 0

    def crawl(self):
        """
        It will continue crawling until the unprocessed urls list is empty
        """

        url = self.unprocessed_urls.pop()
        print("CRAWL : {}".format(url))
        self.parse_url(url)


        if len(self.unprocessed_urls)!=0:
            self.crawl()
        else:
            print('End of crawling for {} '.format(self.website))
            print('Total urls visited {}'.format(len(self.processed_urls)))
            print('Total Emails found {}'.format(self.email_count))
            print('Dumping processed urls to {}'.format(self.base_url.replace('.', '_') + '.txt'))
            with open(self.base_url.replace('.', '_') + '.txt', 'w') as f:
                f.write('\n'.join(self.processed_urls))

    def parse_url(self, current_url: str):
        """
        It will load and parse a given url, finding all the urls on the page.
        It also filters the urls and adds them to the unprocessed url list.
        Finally it scrapes any emails found on the page and updates the email list

        INPUT:
            current_url: URL to parse
        RETURN:
            None
        """

        # TODO: retry a failing url up to 5 times before skipping it; no retry logic is implemented yet, so a failed request will raise here
        response = requests.get(current_url, headers=self.headers)
        tree = html.fromstring(response.content)
        urls = tree.xpath('//a/@href')  # getting all urls in the page


        #Here we will make sure that we convert the sub domain to full urls
        # example --> /about.html--> https://www.website.com/about.html
        urls = [urljoin(self.website,url) for url in urls]
        # now let's make sure that we only include urls that fall under our domain, i.e. filtering out urls that point outside our main website.
        urls = [url for url in urls if self.base_url == urlsplit(url).netloc]


        #removing duplicates
        urls = list(set(urls))


        #filtering urls that point to files such as images, videos and others as listed in garbage_extensions
        #Here we will loop through all the urls and skip any that end with one of the extensions
        parsed_url = []
        for url in urls:
            skip = False
            for extension in self.garbage_extensions:
                if not url.endswith(extension) and not url.endswith(extension + '/'):
                    pass
                else:
                    skip = True
                    break
            if not skip:
                parsed_url.append(url)

        # finally filtering urls that are already in queue or already visited
        for url in parsed_url:
            if url not in self.processed_urls and url not in self.unprocessed_urls:
                self.unprocessed_urls.add(url)


        #parsing email
        self.parse_emails(response.text)
        # adding the current url to processed list
        self.processed_urls.add(current_url)



    def parse_emails(self, text: str):
        """
        It scans the given text to find email addresses and then writes them to csv
        Input:
            text: text to parse emails from
        Returns:
            bool: True or false (True if email was found on page)
        """
        # parsing emails and then saving to csv
        emails = set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text, re.I))
        #TODO: sometime "gFJS3amhZEg_z39D5EErVg@2x.png" gets accepted as email with the above regex. so for now I will check if email ends with jpeg,png and jpg

        for email in emails:
            skip_email = False
            for checker in ['jpg','jpeg','png']:
                if email.endswith(checker):
                    skip_email = True
                    break

            if not skip_email:
                if email not in self.emails:
                    with open(self.outputfile, 'a', newline='') as csvf:
                        csv_writer = csv.writer(csvf)
                        csv_writer.writerow([email])
                    self.email_count += 1
                    self.emails.add(email)
                    print(' {} Email found {}'.format(self.email_count,email))

        if len(emails)!=0:
            return True
        else:
            return False



print('WELCOME TO EMAIL CRAWLER')
try:
    website = sys.argv[1]
except IndexError:
    website = input("Please enter a website to crawl for emails:")
crawl = EmailCrawler(website)
crawl.crawl()
 

Here is the HTML code:

 <html lang="en" dir="ltr">
  <head>
    <meta charset="utf-8">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
    <title>Email Tracker</title>
    <link rel="stylesheet" type="text/css" href="{% static '/css/main.css' %}">
    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.1.0/css/all.css" integrity="sha384-lKuwvrZot6UHsBSfcMvOkWwlCMgc0TaWr+30HWe3a4ltaBwTZhyTEggF5tJv8tbt" crossorigin="anonymous">
  </head>

  <!-- Search form -->
  <div class="container">
      <br/>
    <div class="row justify-content-center">
                          <div class="col-12 col-md-10 col-lg-8">
                              <form class="card card-sm" action="Search/">
                                  <div class="card-body row no-gutters align-items-center">
                                      <div class="col-auto">
                                          <i class="fas fa-search h4 text-body"></i>
                                      </div>
                                      <!--end of col-->
                                      <div class="col">
                                          <input class="form-control form-control-lg form-control-borderless" type="search" placeholder="Search topics or keywords">
                                      </div>
                                      <!--end of col-->
                                      <div class="col-auto">
                                          <button class="btn btn-lg btn-success" type="submit" onclick="">Search</button>
                                      </div>
                                      <!--end of col-->
                                  </div>
                              </form>
                          </div>
                          <!--end of col-->
                      </div>


  </div>
</html>
 

Any help would be much appreciated!

Comments:

1. Can you post your views.py and urls.py too, please?

2. Here are the views:

from django.shortcuts import render
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework import generics, status

def index(request):
    return render(request, 'leadfinderapp/main.html')

And here is urls.py:

from django.contrib import admin
from django.urls import path
from . import views

urlpatterns = [
    path('', views.index, name='index'),
]

Answer #1:

Your HTML says that whenever you submit the form, it goes to the URL specified in action, which in your case is 'Search/'. And you don't have any URL pattern or view written for that. Also, when you write the search view, try calling the methods of the EmailCrawler class from it; then it will work.
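
A minimal sketch of that wiring, assuming the app layout from the comment above (the view name search, the query parameter q, and a crawler.py module holding EmailCrawler are assumptions for illustration, not from the original post):

# urls.py -- adds a pattern matching the form's action
from django.urls import path
from . import views

urlpatterns = [
    path('', views.index, name='index'),
    path('search/', views.search, name='search'),
]

# views.py -- instantiates the crawler the same way the bottom of the script does
from django.shortcuts import render
from .crawler import EmailCrawler  # assumes the crawler code lives in crawler.py

def search(request):
    # The template's <input> needs a name attribute (e.g. name="q"),
    # otherwise its value never reaches request.GET.
    website = request.GET.get('q', '')
    if website:
        crawler = EmailCrawler(website)
        crawler.crawl()  # blocks until the whole site has been crawled
    return render(request, 'leadfinderapp/main.html', {'website': website})

The form in the template would then point at the new pattern, e.g. action="{% url 'search' %}" method="get". Note that crawl() runs synchronously, so the request will hang until the crawl finishes; for anything beyond a demo this would normally be pushed to a background task.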

Comments:

1. Hi! Thanks for the reply! But there are a couple of things I don't understand. What do you mean by calling the methods in EmailCrawler?
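
For reference, "calling the methods in EmailCrawler" just means doing inside the Django view what the last two lines of the crawler script already do when it runs standalone, with the website coming from the submitted form instead of sys.argv:

crawler = EmailCrawler(website)  # website taken from the form input
crawler.crawl()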