mirror of
https://github.com/tymur999/braintok.git
synced 2025-08-02 23:00:38 +00:00
31 lines
1020 B
Python
31 lines
1020 B
Python
import pdfplumber
|
|
import json
|
|
import json
|
|
import docx
|
|
|
|
|
|
def convert_file(inputFileName, outputFileName):
    """Extract the text of a PDF or DOCX file and save it as JSON.

    The text is collected into a list of strings -- one entry per page for a
    PDF, one entry per paragraph for a DOCX -- and that list is written as a
    single JSON array to ``<outputFileName>.jsonl``.

    Args:
        inputFileName: Path of the file to read. The text after the last
            ``.`` (compared case-insensitively) selects the reader and must
            be ``pdf`` or ``docx``.
        outputFileName: Output path without extension; ``.jsonl`` is
            appended to it.

    Raises:
        ValueError: If ``inputFileName`` does not end in ``.pdf`` or
            ``.docx``. Raised before anything is read or written, so no
            output file is left behind.
    """
    # Use the text after the LAST dot so names such as "my.report.pdf" or
    # "./doc.pdf" resolve to the real extension; a name with no dot at all
    # yields an empty extension instead of an IndexError.
    _, dot, suffix = inputFileName.rpartition(".")
    file_type = suffix.lower() if dot else ""

    if file_type == "pdf":
        with pdfplumber.open(inputFileName) as pdf_file:
            content = [page.extract_text() for page in pdf_file.pages]
    elif file_type == "docx":
        # A .docx is a binary zip archive: hand python-docx the path rather
        # than a text-mode file handle, which cannot be read as a zip.
        document = docx.Document(inputFileName)
        content = [para.text for para in document.paragraphs]
    else:
        raise ValueError(
            f"Unsupported file type for {inputFileName!r}: "
            "expected a .pdf or .docx file"
        )

    # NOTE: despite the .jsonl suffix the whole list is written as one JSON
    # array (not one record per line); kept as-is for existing consumers.
    with open(f"{outputFileName}.jsonl", "w", encoding="utf-8") as jsonl_file:
        jsonl_file.write(json.dumps(content))

    # Label the message with the actual source type ("PDF" / "DOCX").
    print(f"{file_type.upper()} content saved to {outputFileName}.jsonl")
|
|
|
|
|
|
# inputFileName = input("enter input file name with extension (supported types: pdf, docx): ")
|
|
# outputFileName = input("enter output file name without extension:")
|
|
# convert_file(inputFileName, outputFileName) |