OpenAI API, file conversions

2025-08-03 15:20:38 +00:00 · 2024-10-20 12:20:48 -04:00
parent ec35a72ade
commit 3c4f501316
3 changed files with 60 additions and 0 deletions
--- a/fileFormatConverter.py
+++ b/fileFormatConverter.py
@@ -0,0 +1,31 @@
+import pdfplumber
+import json
+import json
+import docx
+
+
+def convert_file(inputFileName, outputFileName):
+    """Extract the text of a PDF or DOCX file and save it as JSON.
+
+    The text is collected into a list of strings (one entry per PDF page,
+    or one entry per DOCX paragraph), serialized with ``json.dumps`` and
+    written to ``<outputFileName>.jsonl``.
+
+    Args:
+        inputFileName: Path of the file to convert. The file type is taken
+            from the text after the last ``.`` (case-insensitive); ``pdf``
+            and ``docx`` are supported.
+        outputFileName: Output path without extension; ``.jsonl`` is
+            appended to it.
+
+    Raises:
+        ValueError: If the input name has no extension or an unsupported one.
+    """
+    # Take the text after the LAST dot so names such as "my.report.pdf" or
+    # "./notes.docx" resolve to the real extension, and a name without any
+    # dot yields an empty type instead of an IndexError.
+    _, dot, extension = inputFileName.rpartition('.')
+    fileType = extension.lower() if dot else ""
+
+    if fileType == "pdf":
+        with pdfplumber.open(inputFileName) as pdf_file:
+            content = [page.extract_text() for page in pdf_file.pages]
+    elif fileType == "docx":
+        # A .docx is a zip archive: hand python-docx the path itself rather
+        # than a text-mode file handle, which it cannot read.
+        doc = docx.Document(inputFileName)
+        content = [para.text for para in doc.paragraphs]
+    else:
+        # Fail early with a clear message instead of hitting an unbound
+        # `content` further down.
+        raise ValueError(
+            f"unsupported file type for {inputFileName!r} "
+            "(supported types: pdf, docx)"
+        )
+
+    # NOTE: the whole list is written as a single JSON array on one line,
+    # not as one JSON record per line.
+    with open(f'{outputFileName}.jsonl', "w") as jsonl_file:
+        jsonl_file.write(json.dumps(content))
+    # Report the actual source type ("PDF" or "DOCX") in the status message.
+    print(f"{fileType.upper()} content saved to {outputFileName}.jsonl")
+
+
+# inputFileName = input("enter input file name with extension (supported types: pdf, docx): ")
+# outputFileName = input("enter output file name without extension:")
+# convert_file(inputFileName, outputFileName)