잠토의 잠망경

[python] extract highlighted text (highlight된 글자 추출) 본문

카테고리 없음

[python] extract highlighted text (highlight된 글자 추출)

잠수함토끼 2021. 8. 28. 23:00

https://stackoverflow.com/questions/9099497/how-to-extract-highlighted-parts-from-pdf-files

from typing import List, Tuple

import fitz  # install with 'pip install pymupdf'


def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
    """Return the text of the words touched by one annotation.

    ``annot.vertices`` is consumed four points at a time. Each group of four
    is turned into a ``fitz.Quad`` and reduced to its ``rect``. A word from
    *wordlist* is selected when the rectangle built from its first four
    fields intersects that rect; its fifth field is the word text.

    Args:
        annot: Annotation whose ``vertices`` describe the marked areas.
        wordlist: Word tuples; they are emitted in the order given.

    Returns:
        The words of each quad joined by single spaces, with the per-quad
        strings themselves joined by single spaces.
    """
    vertices = annot.vertices
    n_quads = len(vertices) // 4  # trailing points that do not fill a quad are ignored
    fragments = []
    for start in range(0, 4 * n_quads, 4):
        # Rectangle covering the current marked area.
        area = fitz.Quad(vertices[start : start + 4]).rect
        hits = (entry[4] for entry in wordlist if fitz.Rect(entry[:4]).intersects(area))
        fragments.append(" ".join(hits))
    return " ".join(fragments)


def handle_page(page):
    """Collect the text under every highlight annotation on *page*.

    PyMuPDF renamed its camelCase API (``getText``, ``firstAnnot``) to
    snake_case (``get_text``, ``first_annot``) and later dropped the old
    names. The snake_case names are preferred here, with a fallback to the
    legacy ones so the function works on either generation of the library.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are linked on the page. Empty when there are none.
    """
    highlight_type = 8  # numeric annotation type of a highlight (fitz.PDF_ANNOT_HIGHLIGHT)

    get_text = getattr(page, "get_text", None)
    if get_text is None:
        get_text = page.getText  # legacy PyMuPDF name
    wordlist = get_text("words")  # list of words on page
    # Reading order: ascending bottom edge (y1), then left edge (x0).
    wordlist.sort(key=lambda w: (w[3], w[0]))

    # hasattr rather than getattr-with-default: first_annot is legitimately
    # None on a page without annotations and must not trigger the fallback.
    if hasattr(page, "first_annot"):
        annot = page.first_annot
    else:
        annot = page.firstAnnot  # legacy PyMuPDF name

    highlights = []
    while annot:
        if annot.type[0] == highlight_type:
            highlights.append(_parse_highlight(annot, wordlist))
        annot = annot.next
    return highlights


def main(filepath: str) -> List:
    """Extract the highlighted text of every page of a PDF.

    Args:
        filepath: Path of the document to open with ``fitz.open``.

    Returns:
        A list of ``[page_number, texts]`` entries, where ``page_number`` is
        1-based and ``texts`` is the list returned by ``handle_page`` for
        that page. Pages for which ``handle_page`` returns nothing are
        omitted.
    """
    highlights = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(filepath) as doc:
        for page_number, page in enumerate(doc, start=1):
            page_highlights = handle_page(page)
            if page_highlights:
                highlights.append([page_number, page_highlights])
    return highlights


if __name__ == "__main__":
    # Script entry point: print the highlights found in the sample document.
    extracted = main(r"sample.pdf")
    print(extracted)
Comments