#python #list #filter #append
Вопрос:
Я хочу добавлять в список только те разобранные строки, у которых значение coord меньше 780.
# Walk every PDF in `directory`, collect font data and text boxes with
# pdfminer, and gather the results in `data` for a pandas DataFrame.
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the statements below cannot be determined from the text alone.
directory = 'C:/Users/'
data = []
for file in os.listdir(directory):
# Skip every directory entry whose name does not end in ".pdf".
if not file.endswith(".pdf"):
continue
fake_file_handle = io.StringIO()
with open(os.path.join(directory, file), 'rb') as fh:
resource_manager = PDFResourceManager()
laparams = LAParams(line_margin = 0.6)
device = PDFPageAggregator(resource_manager, laparams = laparams)
page_interpreter = PDFPageInterpreter(resource_manager, device)
# Accumulators filled while iterating over the layout objects.
sizes = []
fonts = []
positions = []
raw_text = []
for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
page_interpreter.process_page(page)
# NOTE(review): nothing visible in this listing writes to fake_file_handle,
# and `text` is not read again afterwards.
text = fake_file_handle.getvalue()
layout = device.get_result()
for lobj in layout:
if isinstance(lobj, LTTextBoxHorizontal):
# coord is bbox[1] truncated to int; word is the box text without
# surrounding whitespace. Presumably bbox[1] is the lower y-coordinate of
# the box -- TODO confirm against the pdfminer layout documentation.
coord, word = int(lobj.bbox[1]), lobj.get_text().strip()
raw_text.append([coord, word])
for text_line in lobj:
for character in text_line:
if isinstance(character, LTChar):
Font_size = character.size
# Drops the first seven characters of the font name; presumably a
# subset prefix such as "ABCDEF+" -- TODO confirm.
Font_name = character.fontname[7:]
if character.matrix[0]>0 :
position = character.bbox # font position
sizes.append(Font_size)
fonts.append(Font_name)
positions.append(position)
# Intended filter: keep only the elements whose coordinate is at most 780.
font_pos = []
maxFontpos = int(780)
# enumerate() yields (index, item) pairs, so here `coord` is the position of
# the item inside raw_text and `word` is the whole [coord, text] pair. The
# comparison below therefore tests the list index, not the stored coordinate.
for coord, word in enumerate(raw_text):
if coord <= maxFontpos:
font_pos.append(word)
else:
pass
#strings = []
#for i in font_pos:
# strings.append(word[i])
# Font_size and Font_name hold whatever the most recently visited LTChar
# assigned to them.
data.append([Font_size, Font_name, font_pos])
# NOTE(review): `converter` is not defined anywhere in this listing.
converter.close()
fake_file_handle.close()
df1 = pd.DataFrame(data, columns =['Font_size', 'Font_name', 'text'])
Этот код даёт мне список списков, в котором всё ещё есть элементы с координатами больше 780, а именно 788 и 791:
# Print the 'text' entry of the first row of df1.
print(df1.iloc[0]['text'])
[[778, 'text i want to keep'],
[776, 'text i want to keep'],
[546, 'text i want to keep'],
[510, 'text i want to keep'],
[174, 'text i want to keep'],
[79, 'text i want to keep'],
[48, 'text i want to keep'],
[34, 'text i want to keep'],
[788, 'text i dont want to keep'],
[791, 'text i dont want to keep'],
[735, 'text i want to keep'],
[675, 'text i want to keep'],
[651, 'text i want to keep']]
Вопрос: Почему этот фрагмент кода
font_pos = []
maxFontpos = int(780)
# enumerate() yields (index, item) pairs: `coord` receives the position of the
# item inside raw_text and `word` receives the item itself.
for coord, word in enumerate(raw_text):
if coord <= maxFontpos:
font_pos.append(word)
else:
pass
data.append([Font_size, Font_name, font_pos])
не работает так, как задумывалось?
Ответ №1:
Не используйте enumerate: он возвращает пары (индекс, элемент), поэтому в coord попадает индекс элемента в списке, а не координата. Попробуйте так:
# Keep only the text of the entries whose coordinate does not exceed the limit.
# Every item of raw_text is a [coord, word] pair, so it is unpacked directly;
# wrapping the list in enumerate() would bind coord to the list index instead.
maxFontpos = 780
font_pos = [word for coord, word in raw_text if coord <= maxFontpos]