일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
| | | 1 | 2 | 3 | 4 | 5 |
6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 | 21 | 22 | 23 | 24 | 25 | 26 |
27 | 28 | 29 | 30 | | | |
Tags
- Button
- mean
- mariadb
- GitHub
- DFS
- pycharm
- 삼성소프트웨어멤버십
- CNN
- Lotto
- SPL
- index
- 알고리즘
- synology
- Series
- GT-S80
- Numpy
- dataframe
- RNN
- LSTM
- javascript
- E-P1
- Splunk
- keras
- install
- pip
- ipad
- Python
- SciPy
- imread
- pandas
Archives
- Today
- Total
잠토의 잠망경
[python] extract highlighted text (highlight된 글자 추출) 본문
https://stackoverflow.com/questions/9099497/how-to-extract-highlighted-parts-from-pdf-files
from typing import List, Tuple
import fitz # install with 'pip install pymupdf'
def _parse_highlight(
    annot: fitz.Annot,
    wordlist: List[Tuple[float, float, float, float, str, int, int, int]],
) -> str:
    """Return the text of the words touched by one highlight annotation.

    Args:
        annot: Annotation whose ``vertices`` are read as consecutive groups
            of four points, one group per highlighted quadrilateral.
        wordlist: Word tuples; the first four items of each tuple are used
            as the word rectangle and item 4 as the word text.

    Returns:
        The intersecting words joined by single spaces, quad after quad,
        keeping ``wordlist`` order inside each quad. An empty string when
        the annotation carries no vertices.
    """
    # Guard against a missing (None) vertex list: report "nothing
    # highlighted" instead of failing inside len().
    points = annot.vertices or []
    # Floor division: leftover points that do not form a full quad are ignored.
    quad_count = len(points) // 4
    sentences = []
    for i in range(quad_count):
        # Bounding rectangle of the i-th highlighted quad.
        r = fitz.Quad(points[i * 4 : i * 4 + 4]).rect
        # Keep every word whose rectangle intersects the highlighted area.
        words = [w for w in wordlist if fitz.Rect(w[:4]).intersects(r)]
        sentences.append(" ".join(w[4] for w in words))
    return " ".join(sentences)
def handle_page(page):
    """Collect the text of every highlight annotation on a page.

    Args:
        page: A PyMuPDF page. Both the current snake_case API
            (``get_text`` / ``first_annot``) and the legacy mixedCase API
            (``getText`` / ``firstAnnot``) are accepted.

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are linked on the page. Empty when there are none.
    """
    # Newer PyMuPDF releases no longer expose the mixedCase aliases, so
    # prefer the snake_case names and fall back only for old installs.
    get_text = getattr(page, "get_text", None) or page.getText
    wordlist = get_text("words")  # list of words on page
    wordlist.sort(key=lambda w: (w[3], w[0]))  # ascending y, then x

    if hasattr(page, "first_annot"):
        annot = page.first_annot
    else:
        annot = page.firstAnnot

    highlights = []
    # Annotations form a linked list: follow ``next`` until it runs out.
    while annot:
        # Type code 8 is PyMuPDF's highlight annotation (PDF_ANNOT_HIGHLIGHT).
        if annot.type[0] == 8:
            highlights.append(_parse_highlight(annot, wordlist))
        annot = annot.next
    return highlights
def main(filepath: str) -> List:
    """Extract the highlighted text of every page in a document.

    Args:
        filepath: Path of the document handed to ``fitz.open``.

    Returns:
        A list of ``[page_number, highlights]`` entries, where
        ``page_number`` is 1-based and ``highlights`` is the list returned
        by ``handle_page``. Pages without highlights are omitted.
    """
    highlights = []
    doc = fitz.open(filepath)
    try:
        for page_number, page in enumerate(doc, start=1):
            highlighted_texts = handle_page(page)
            if highlighted_texts:
                highlights.append([page_number, highlighted_texts])
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()
    return highlights
if __name__ == "__main__":
    # Script entry point: print the highlights found in the sample document.
    extracted = main(r"sample.pdf")
    print(extracted)
Comments