| 일 | 월 | 화 | 수 | 목 | 금 | 토 |
|---|---|---|---|---|---|---|
| 1 | 2 | 3 | 4 | 5 | 6 | |
| 7 | 8 | 9 | 10 | 11 | 12 | 13 |
| 14 | 15 | 16 | 17 | 18 | 19 | 20 |
| 21 | 22 | 23 | 24 | 25 | 26 | 27 |
| 28 | 29 | 30 | 31 | | | |
Tags
- Numpy
- javascript
- Series
- CNN
- Python
- synology
- pandas
- 알고리즘
- install
- ipad
- Splunk
- pip
- 삼성소프트웨어멤버십
- imread
- GitHub
- keras
- index
- SPL
- SciPy
- Button
- LSTM
- RNN
- dataframe
- DFS
- mean
- E-P1
- pycharm
- mariadb
- Lotto
- GT-S80
Archives
- Today
- Total
잠토의 잠망경
[python] extract highlighted text (highlight된 글자 추출) 본문
https://stackoverflow.com/questions/9099497/how-to-extract-highlighted-parts-from-pdf-files
from typing import List, Tuple
import fitz # install with 'pip install pymupdf'
def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
    """Return the text covered by a single highlight annotation.

    Args:
        annot: Annotation whose ``vertices`` are consumed in consecutive
            groups of four points, one quadrilateral per group.
        wordlist: Word tuples for the page. The first four items of each
            tuple are used as the word's rectangle and item 4 as its text.

    Returns:
        The words overlapping each quadrilateral joined by single spaces,
        with the per-quadrilateral strings again joined by single spaces.
        An empty string when the annotation has no vertices.
    """
    points = annot.vertices
    if not points:
        # ``vertices`` may be None/empty; without this guard len() would raise.
        return ""

    # Every four consecutive points describe one highlighted quadrilateral;
    # a trailing incomplete group is ignored.
    quad_count = len(points) // 4
    sentences = []
    for i in range(quad_count):
        # Bounding rectangle of the i-th highlighted quadrilateral.
        quad_rect = fitz.Quad(points[i * 4 : i * 4 + 4]).rect
        # A word is kept as soon as its rectangle intersects the quad's
        # rectangle, so partially covered words are included as well.
        words = [w for w in wordlist if fitz.Rect(w[:4]).intersects(quad_rect)]
        sentences.append(" ".join(w[4] for w in words))
    return " ".join(sentences)
def handle_page(page):
    """Collect the text of every highlight annotation on a page.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A list with one string per highlight annotation, in the order the
        annotations are iterated. Empty when the page has no highlights.
    """
    # Newer PyMuPDF releases name the method ``get_text``; fall back to the
    # legacy camelCase ``getText`` so older installations keep working.
    get_text = getattr(page, "get_text", None) or page.getText
    wordlist = get_text("words")  # list of words on page
    wordlist.sort(key=lambda w: (w[3], w[0]))  # ascending y, then x

    # ``page.annots()`` replaces the legacy ``firstAnnot`` / ``next`` chain.
    # Type code 8 is the PDF highlight annotation type.
    return [
        _parse_highlight(annot, wordlist)
        for annot in page.annots()
        if annot.type[0] == 8
    ]
def main(filepath: str) -> List:
    """Extract the highlighted text from every page of a document.

    Args:
        filepath: Path of the document to open with ``fitz.open``.

    Returns:
        A list of ``[page_number, highlighted_texts]`` pairs, where
        ``page_number`` is 1-based and ``highlighted_texts`` is the list
        returned by ``handle_page``. Pages without highlights are omitted.
    """
    highlights = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(filepath) as doc:
        for page_number, page in enumerate(doc, start=1):
            highlighted_texts = handle_page(page)
            if highlighted_texts:
                highlights.append([page_number, highlighted_texts])
    return highlights
if __name__ == "__main__":
    # Script entry point: print the highlights found in the sample document.
    extracted = main(r"sample.pdf")
    print(extracted)
Comments