Поиск текстовой строки с помощью pfdminer не согласован [Python]

ProgramBox

Поиск текстовой строки с помощью pfdminer не согласован [Python]

Post author:admin
Запись опубликована:11 апреля, 2023
Post category:Вопросы по программированию

#python #string #return-value #pdfminer

#python #строка #возвращаемое значение #pdfminer

Вопрос:

У меня вопрос о коде, который получает текстовую строку из файла pdf и возвращает выходные данные в формате .csv

Выходные данные хранятся в Output.csv. Как вы можете видеть, он возвращает значение на стр.27, здесь код работает, а 29, стр. 28 отсутствует. То, что я хочу вернуть, — это текстовая строка на стр. 28, код которой не работает.

Кто-нибудь может сказать мне, что я делаю не так? Во втором коде pdfminer считывает правильный вывод, который необходим.

 import re, csv, os
import sys, time
from tqdm import tqdm
import multiprocessing as mp
from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
try:
  from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  from pdfminer.converter import TextConverter
  from pdfminer.layout import LAParams
  from pdfminer.pdfpage import PDFPage
except ImportError:
  print ("Trying to Install required module: pdfminern")
  os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)


# method 3: object oriented programming
class Program:
    #initialisation (happens when Program() is called for the first time)
    def __init__(self):

        # locations
        # this defines the location of the workspace and directory of the data to process
        self.ws_loc = Path("C:/Users/pco/Desktop/workspace")
        self.dat_loc = Path("C:/Users/pco/Desktop/workspace/data/Test")
        
        # lookuptable
        # this converts the lookuptable from maximo to a list which can be used for comparison
        self.lu_file = self.ws_loc / "lookuptable.csv"
        with open(self.lu_file, newline='') as f:
            reader = csv.reader(f)
            self.lu_list = list(filter(None,list(reader)))
            self.lu_list = [each[0] for each in self.lu_list]


    def listener(self,q):
        '''listens for messages on the q (queue), writes (appends) to file (output.csv). '''
        # open output.csv in location workspace/data/ and use as 'f'
        with open(self.ws_loc / 'output.csv', 'a') as f:
            
            #start infinite listening loop until 'kill' message is received
            while 1:
                
                # get the message which is first in q (queue)
                m = q.get()
                
                # break loop if message is kill and close file 'output.csv'
                if m == 'kill':
                    f.close()
                    break
                
                # if message is not 'kill' then write message to file and flush file
                f.write(m)
                f.flush()

      
    def worker(self, file, q):
        ''' processes a pdf file given by main() and writes output to q (queue)'''
        
        # init PDF class (this class is used to get pages from the PDF and process pdftext)
        PDF = self.PDF(self.dat_loc,self.lu_list,0)
        
        # get all the pages from PDF: contains pages = [page1, ..., pageN]
        # pageN = "bla bla n bla etc."
        PDFpages = PDF.getPages(file)
        pages = []
        for page in PDFpages:
            pages.append(page)
        
        # varargs defines extra data for files (this is where metadata is stored)
        # varargs should not be filled here, but it is initialized here.
        varargs = ''
        
        # check if file is a manual (this can be seen as an example for a varargs entry)
        # it should contain atleast ',' (this creates a new column entry in the csv)
        # PDF.fileCategory() which is a class within the Program class, can be taken as an example
        varargs = PDF.fileCategory(file,pages)   ','   PDF.fileSupplier(file, pages)   ','   PDF.fileRev(file, pages)
        # new vararg can be added like: varargs = THE_VARARG
        
        # initialise pageNum (which is a page number identifier inside the for loop)
        pageNum = 1
        
        # create an empty datastack (which is the message that will be send to q (queue))
        datastack = ''
        
        # for each page do...
        for page in pages:
            
            '''!!! for each page look for tags (THIS IS WHERE THE REGEX HAPPENS PDF.find_tag()) !!!'''
            found_strings, found = PDF.find_tag(page)
            # found_stringsrev, foundrev = PDF.find_rev(page)
            
            # if tags are found, then fix the tags such that they are correct with
            # Program.putStripe() (or self.putStripe()) it changes 12AB1234A to 12-AB-1234-A
            # if foundrev:
            #     string = ''
            #     fixedstring = ''
            #     for stringrev in found_stringsrev:
            #         # fill datastack with found tags
            #         datastack  = file   ','   str(pageNum)   ','   string   ','   fixedstring   ','   stringrev   ','   varargs   'n'
                    
            if found:
                for string in found_strings:
                    # if correct, do not change
                    fixedstring = string
                    
                    # check if the tag matches the correct regexpression ('regex' or 're')
                    if re.match('^(d{1,2}[ -]{,1}[A-Z]{1,4}[ -]{,1}d{4}[ -]{,1}[A-Z]*).*$', string)!=None:
                        
                        # else fix the tag
                        fixedstring = self.putStripe(string)
                        
                    # fill datastack with found tags
                    datastack  = file   ','   str(pageNum)   ','   string   ','   fixedstring   varargs   'n'
            
            
            # next page, so pageNum becomes pageNum   1
            pageNum  =1
        
        # if the datastack is empty, we are still interested in the varargs:
        # (so empty tag columns are added)
        if datastack=='':
            datastack = file   ','   ','   ','   varargs   'n'
        
        # put the datastack message inside of the q (queue)
        q.put(datastack)
        
        # terminate the PDF class so that the pdf file is closed in a correct way
        PDF.terminate()
        
        # return (in case the datastack should be printed)
        return datastack
    
    
    def putStripe(self,input):
        '''This function fixes a tag that is not correct'''
        # strip the tag from spaces
        input = re.sub(' ','',input)
        
        # for each string that matches the expression write to words
        words = re.findall('[0-9][A-Za-z] ', input)
        words  = re.findall('[A-Za-z][0-9] ', input)
        
        # for each match inside the tag add a '-' in the second position
        for word in words:
            i = input.find(word) 1
            input = input[:i]   '-'   input[i:]
        
        # return the fixed tag
        return input
    
    def main(self):
        try:
            # initiate time
            t = time.time()
            
            # create pools for paralell pooling (max cpu threads is optained automatically)
            pool = mp.Pool(mp.cpu_count()   2)
            
            # create a manager
            manager = mp.Manager()
            
            # from the pool manager create a queue object which can be used to 
            # exchange data between the worker and listener
            q = manager.Queue()
            
            # start up listener first
            # ignore warning, it is being used
            watcher = pool.apply_async(self.listener, (q,))
            
            # fire off workers (basically assign them jobs)
            jobs = []
            
            # NOTE: FOR LOOPS DO NOT CAUSE A LOOP, CODE PROCEEDS WITH PARALLEL THREADING
            # AS IF THE RESULT OF EACH LOOP IS INSTANTLY COMPLETED
            # each file in the data location is a job
            for file in os.listdir(self.dat_loc):
                # assign the job to a worker
                job = pool.apply_async(self.worker, (file, q))
                # append the job to jobs (for data aquisition)
                jobs.append(job)
            
            # this is used to get the data back from jobs
            for job in tqdm(jobs):
                #print('')
                #print(job.get()[:-1])
                job.get()
            
            # printed elapsed time (good for project management)
            print('elapsed time = '   str(time.time()-t)   ' seconds')
        
        # catch interupt and try to properly terminate workers (might take time)
        # best to just do everything in batches and dont interrupt
        except KeyboardInterrupt:
            print("nCaught KeyboardInterrupt, terminating workers")
            q.put('kill') # <-- makes sure the output.csv is always closed properly
            pool.close()
            pool.join()
            pool.terminate()
            SystemExit(1)
        
        # always excecute (kills workers and listener)
        finally:
            q.put('kill') # <-- makes sure the output.csv is always closed properly
            pool.close()
            pool.join()

    def execute(self):
        self.main()

    class PDF:
        # from PDF.
        def __init__(self,dat_loc,lu_list,maxpages):
            self.dat_loc = dat_loc
            self.lu_list = lu_list
            self.lu_list_f = 0
            self.password = ""
            self.maxpages = maxpages
            self.caching = True
            self.rsrcmgr = PDFResourceManager()
            self.retstr = StringIO()
            self.laparams = LAParams()
            self.device = TextConverter(self.rsrcmgr, self.retstr, laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            self.pagenos=set()
        
        # from PDF.
        def getPages(self,file):
            self.fp = open(self.dat_loc / file, 'rb')
            pages = PDFPage.get_pages(self.fp,
                                      self.pagenos,
                                      maxpages=self.maxpages,
                                      password=self.password,
                                      caching=self.caching,
                                      check_extractable=True)
            return pages
        
        # from PDF.
        def fileCategory(self,file,pages):
            rules = []
            rules.append(['Manual',['ANLAGE - INSTALLATION','User Guide','MANUAL','Manual','manual','Handleiding','handleiding','Instruction','instructions','Instructie', 'Guide', 'GUIDE']])
            rules.append(['Specification',['SPECIFICATION','Specification','Specificatie']])
            rules.append(['Datasheet',['DATA BOOK','UTILITIES LIST','DATA PACKAGE','Data Package','data-sheet','Datasheet','DATASHEET','datasheet','DATA SHEET','Data Sheet','Data sheet','data sheet']])
            rules.append(['Spare part list',['SPARE PARTS LIST']])
            rules.append(['Invoice',['BILL OF MATERIAL','invoice','Invoice','INVOICE','Purchase order','Purchase Order','PURCHASE ORDER']])
            rules.append(['Schematic Diagram',['SCHEMATIC DIAGRAM','Schematic Diagram','Schematic diagram', 'ISOMETRIC', 'Isometric', 'isometric']])
            rules.append(['Checklist', ['Checklist', 'CHECKLIST', 'CHECKSHEET', 'Checksheet']])
            rules.append(['Certificates', ['Certificate', 'CERTIFICATE', 'Zertifikat', 'ZERTIFIKAT', 'Certificat', 'CERTIFICAT']])
            rules.append(['Required documents list', ['REQUIRED SUBMITTAL DOCUMENTS']])
            
            fileCategory = ''

            found = False
            counter = 1
            for page in pages:
                if counter>4:
                    break
                for rule in rules:
                    category = rule[0]
                    category_rules = rule[1]
                    for line in self.pagestr(page).splitlines():
                        if any(line.find(x)!=-1 for x in category_rules):
                            found = True
                        if found:
                            break
                    if found:
                        break
                if found:
                    break
                counter =1
            if found:
                fileCategory  = ','   category
            else:
                fileCategory  = ','   'Unreadable'
            return fileCategory
        
     # from PDF.
        def fileSupplier(self,file,pages):
            rules = []
            rules.append(['JE Jacobs',['JE Jacobs', 'JE JACOBS', 'Jacobs', 'JACOBS']])
            rules.append(['Emerson',['Emerson', 'Emerson Process Management', 'EMERSON',]])
            rules.append(['Air Liquide',['Air Liquide', 'AIR LIQUIDE']])
            rules.append(['Rosemount',['ROSEMOUNT', 'Rosemount']])
            rules.append(['Deltak',['Deltak', 'DELTAK']])
            rules.append(['AviComp',['AVICOMP', 'Avicomp', 'avicomp']])
            
            fileSupplier = ''

            found = False
            counter = 1
            for page in pages:
                if counter>4:
                    break
                for rule in rules:
                    category = rule[0]
                    category_rules = rule[1]
                    for line in self.pagestr(page).splitlines():
                        if any(line.find(x)!=-1 for x in category_rules):
                            found = True
                        if found:
                            break
                    if found:
                        break
                if found:
                    break
                counter =1
            if found:
                fileSupplier  = ','   category
            else:
                fileSupplier  = ','   'Supplier N/A'
            return fileSupplier
        
        # from PDF.
        def fileRev(self,file,pages):            
            fileRev = ''

            found = False
            counter = 1
            for page in pages:
                if counter>4:
                    break
                for line in self.pagestr(page).splitlines():
                        if re.match('^(Rev.*).*$', line):
                            found = True
                        if found:
                            break
                if found:
                    break
                counter =1
            if found:
                fileRev  = ','   line
            else:
                fileRev  = ','   ''
            return fileRev
        
        # from PDF.
        def find_string_lookup(self,page,pageNum,file,varargs):
            datastack = []
            data = []
            found = False
            for line in self.pagestr(page).splitlines():
                line = re.sub('[^A-Za-z0-9] ', '', line)
                counter = 0
                for tag in self.lu_list_f:
                    if line.find(tag)!=-1:
                        found = True
                        data = file   ','   str(self.lu_list[counter][0])   ','   str(pageNum)   varargs  'n'
                        if data not in datastack:
                            datastack  = [data]
                    counter =1
            return datastack, found
        
        # from PDF.
        def find_string(self,page,strings,Method=None):
            
            datastack = []
            data = []
            found = False
            if Method=='ALPHABET_NUM_ONLY':
                tags = [re.sub('[^A-Za-z0-9] ', '', line) for line in strings]
            elif Method=='ALPHABETCAPS_NUM_ONLY':
                tags = [re.sub('[^A-Za-z0-9] ', '', line).upper() for line in strings]
            elif Method=='ALPHABETCAPS':
                tags = [line.upper() for line in strings]
            else:
                tags = strings
            
            for line in self.pagestr(page).splitlines():
                if Method=='ALPHABET_NUM_ONLY':
                    line = re.sub('[^A-Za-z0-9] ', '', line)
                elif Method=='ALPHABETCAPS_NUM_ONLY':
                    line = re.sub('[^A-Za-z0-9] ', '', line).upper()
                elif Method=='ALPHABETCAPS':
                    line = line.upper()
                
                i = 0
                for tag in tags:
                    if tag != '':
                        if line.find(tag)!=-1:
                            found = True
                            data = strings[i]
                            if data not in datastack:
                                datastack  = [data]
                    i =1
            return datastack, found
        
        # from PDF.
        def find_tag(self,page):
            datastack = []
            found = False
            for line in self.pagestr(page).splitlines():                
                tags = re.findall('^(d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}d{4}[ -]{,1}[A-Z]*).*$', line)
                for tag in tags:
                    if tag not in datastack:
                        datastack  = [tag]
                        found = True
            return datastack, found
        
         # from PDF.
        # def find_rev(self,page):
        #     datastack = []
        #     found = False
        #     for line in self.pagestr(page).splitlines():                
        #         tags = re.findall('^(Rev.*).*$', line)
        #         for tag in tags:
        #             if tag not in datastack:
        #                 datastack  = [tag]
        #                 found = True
        #     return datastack, found
        
        
        # from PDF.
        def pagestr(self,page):
            self.retstr.truncate(0)
            self.retstr.seek(0)
            self.interpreter.process_page(page)
            return self.retstr.getvalue()
        
        # from PDF.
        def terminate(self):
            self.fp.close()
            self.device.close()
            self.retstr.close()

# start the code (the proper way)
if __name__ == '__main__':
    Program().execute()

Если я прочитаю pdf с этим кодом на python (также с помощью pdfminer):

 from pathlib import Path
from io import StringIO
try:
  from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  from pdfminer.converter import TextConverter
  from pdfminer.layout import LAParams
  from pdfminer.pdfpage import PDFPage
except ImportError:
  print ("Trying to Install required module: pdfminern")
  os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)

class glb():
    workspace_folder = Path('C:/Users/pco/Desktop/workspace')
    data_folder = Path('C:/Users/pco/Desktop/workspace/data/Test')
    lookup_file = workspace_folder / "lookuptable.csv"
    with open(lookup_file, newline='') as f:
        reader = csv.reader(f)
        lookup_list = list(reader)
        lookup_list_filtered = list(filter(None,[re.sub('[^A-Za-z0-9] ', '', str(line)) for line in lookup_list]))
        
def find_tagnumbers(path):
    pagelines = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    page_no = 1
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
        page_str = retstr.getvalue()
        pagelines.append(page_str.splitlines())
        retstr.truncate(0)
        retstr.seek(0)
        page_no  =1
    page_no  =-1
    print(pagelines)
    fp.close()
    device.close()
    retstr.close()
    return 1


find_tagnumbers('C:/Users/pco/Desktop/workspace/data/Test/1845613_1_27_Marked.pdf')

он возвращает 47-AT -0053. Но если я выполню приведенный ниже код, он не вернет значение в выходной файл. Вывод при печати строк страницы

p.s. мои навыки кодирования для начинающих (поэтому я выписываю все шаги)

Вопрос:

Комментарии:

Вам также может понравиться

В каких полях выполняется поиск с помощью Facebook graph search?

Высота «ввода» с помощью «пробела: предварительный перенос» в Firefox Android больше, чем Chrome в Android

Линтер Python в VS-Коде: возникает ошибка, когда метод имеет аннотацию типа, но не возвращает оператор