이미지 & PDF에서 텍스트 추출하기

차트분석 작성
작성일 2025.02.11 11:50

3,202 조회
목록

<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF & 이미지 OCR 변환기</title>


<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>


<script src="https://cdnjs.cloudflare.com/ajax/libs/tesseract.js/4.0.2/tesseract.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.10.377/pdf.min.js"></script>

<style>
/* Page backdrop: light gray with outer spacing around the card. */
body {
background-color: #f8f9fa;
padding: 30px;
}
/* The single white card holding the whole UI; capped at 600px wide,
   with rounded corners and a soft shadow. */
.container {
max-width: 600px;
background: white;
padding: 20px;
border-radius: 10px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
/* Progress indicator wrapper. Hidden by default; the page script toggles
   it by setting style.display to "block" / "none". */
.loader {
display: none;
margin: 20px auto;
}
</style>
</head>
<body>

<!--
  Single-card UI: file picker, OCR trigger, progress indicator, result box
  and export buttons. Element ids (fileInput, outputText), the .loader class
  and the onclick handler names are the hooks the page script relies on.
-->
<main class="container text-center">
  <h1 class="h2 mb-3">PDF &amp; 이미지 OCR 변환기</h1>
  <p class="text-muted">이미지 또는 PDF를 업로드하면 텍스트를 추출합니다.</p>

  <!-- Placeholder/intro text is not an accessible name, so each control gets a real label. -->
  <label for="fileInput" class="visually-hidden">이미지 또는 PDF 파일 선택</label>
  <input type="file" id="fileInput" class="form-control mb-3" accept="image/*,application/pdf">
  <button type="button" class="btn btn-primary w-100" onclick="processFile()">OCR 실행</button>

  <!-- Hidden by CSS until the script shows it while a file is being processed. -->
  <div class="loader mt-3">
    <div class="spinner-border text-primary" role="status">
      <span class="visually-hidden">처리 중...</span>
    </div>
    <p class="mt-2">OCR 처리 중...</p>
  </div>

  <label for="outputText" class="visually-hidden">추출된 텍스트</label>
  <textarea id="outputText" class="form-control mt-3" rows="5" placeholder="추출된 텍스트가 여기에 표시됩니다..." readonly></textarea>

  <button type="button" class="btn btn-success w-100 mt-2" onclick="copyToClipboard()">클립보드 복사</button>
  <button type="button" class="btn btn-secondary w-100 mt-2" onclick="downloadTextFile()">TXT 다운로드</button>
</main>

<script>
/**
 * Click handler for the "OCR 실행" button.
 *
 * Validates that a file is selected in #fileInput, decides whether it is a
 * PDF or an image, shows the .loader element and hands the file to
 * processPDF() or processImage(). Unsupported files produce an alert and
 * leave the loader hidden.
 */
function processFile() {
  const fileInput = document.getElementById("fileInput");
  const loader = document.querySelector(".loader");

  if (fileInput.files.length === 0) {
    alert("파일을 업로드하세요.");
    return;
  }

  const file = fileInput.files[0];

  // File.type is "" when the browser cannot determine a MIME type, which
  // would otherwise reject perfectly valid files. Only in that case, fall
  // back to the file name extension.
  let fileType = file.type;
  if (!fileType) {
    const lowerName = file.name.toLowerCase();
    if (lowerName.endsWith(".pdf")) {
      fileType = "application/pdf";
    } else if (/\.(png|jpe?g|bmp|webp)$/.test(lowerName)) {
      fileType = "image/*";
    }
  }

  let handler = null;
  if (fileType === "application/pdf") {
    handler = processPDF;
  } else if (fileType.startsWith("image/")) {
    handler = processImage;
  }

  if (handler === null) {
    alert("지원되지 않는 파일 형식입니다. 이미지 또는 PDF를 업로드하세요.");
    return;
  }

  // Show the progress indicator; the handlers hide it again when they finish or fail.
  loader.style.display = "block";
  handler(file);
}

/**
 * Runs Tesseract OCR (English + Korean) on an image file and writes the
 * recognized text into #outputText.
 *
 * On success the loader is hidden and #fileInput is cleared; on any failure
 * the error is logged, the loader is hidden and the user is alerted.
 *
 * @param {File} file - Image selected by the user.
 */
function processImage(file) {
  const loader = document.querySelector(".loader");

  // Start inside a promise chain so that a synchronous throw (e.g. the
  // Tesseract script failed to load) also reaches the catch handler instead
  // of leaving the spinner visible forever.
  Promise.resolve()
    .then(() =>
      // The File is passed directly: Tesseract.js accepts File/Blob inputs,
      // so no FileReader round-trip (and no unhandled read error) is needed.
      // No character whitelist is set: the previous one was supplied through
      // the worker options, and it listed no Hangul or space characters,
      // which contradicts the "eng+kor" language selection.
      Tesseract.recognize(file, "eng+kor", {
        logger: m => console.log(m)
      })
    )
    .then(({ data: { text } }) => {
      document.getElementById("outputText").value = text.trim();
      loader.style.display = "none";

      // Clear the file picker once the result is shown.
      document.getElementById("fileInput").value = "";
    })
    .catch(error => {
      console.error("OCR 오류:", error);
      loader.style.display = "none";
      alert("OCR 처리 중 오류가 발생했습니다.");
    });
}

/**
 * Extracts the embedded text layer of a PDF with pdf.js and writes it into
 * #outputText, pages in document order.
 *
 * Note: this reads text content only (page.getTextContent()); no OCR is run
 * on the page images here.
 *
 * On success the loader is hidden and #fileInput is cleared; on any failure
 * (file read, PDF parsing, or a single page) the error is logged, the loader
 * is hidden and the user is alerted.
 *
 * @param {File} file - PDF selected by the user.
 */
function processPDF(file) {
  const loader = document.querySelector(".loader");

  // Single failure path shared by the FileReader and the pdf.js promise chain.
  const handleFailure = error => {
    console.error("PDF 처리 오류:", error);
    loader.style.display = "none";
    alert("PDF 처리 중 오류가 발생했습니다.");
  };

  const reader = new FileReader();

  // Without this, a failed read would leave the spinner on screen forever.
  reader.onerror = () => handleFailure(reader.error);

  reader.onload = function (event) {
    const typedarray = new Uint8Array(event.target.result);

    Promise.resolve()
      .then(() => pdfjsLib.getDocument(typedarray).promise)
      .then(pdf => {
        const pagePromises = [];
        for (let i = 1; i <= pdf.numPages; i++) {
          pagePromises.push(
            pdf.getPage(i)
              .then(page => page.getTextContent())
              // Each page resolves to its own string so the final order is
              // fixed by page number, not by which promise settles first.
              .then(text => text.items.map(item => item.str + " ").join(""))
          );
        }
        // Returned so that a rejection from any page reaches the catch below.
        return Promise.all(pagePromises);
      })
      .then(pageTexts => {
        document.getElementById("outputText").value = pageTexts.join("").trim();
        loader.style.display = "none";
        document.getElementById("fileInput").value = "";
      })
      .catch(handleFailure);
  };

  reader.readAsArrayBuffer(file);
}

/**
 * Copies the contents of #outputText to the clipboard and reports the
 * outcome with an alert. Alerts and returns early when there is nothing
 * to copy.
 */
function copyToClipboard() {
  const outputText = document.getElementById("outputText");

  if (outputText.value.trim() === "") {
    alert("복사할 텍스트가 없습니다.");
    return;
  }

  const notifySuccess = () => alert("텍스트가 클립보드에 복사되었습니다!");
  const notifyFailure = () => alert("복사 실패! 수동으로 복사해주세요.");

  // navigator.clipboard only exists in secure contexts; calling writeText on
  // undefined would throw synchronously and skip the promise's catch.
  if (navigator.clipboard && typeof navigator.clipboard.writeText === "function") {
    navigator.clipboard.writeText(outputText.value)
      .then(notifySuccess)
      .catch(notifyFailure);
    return;
  }

  // Legacy fallback: select the textarea and use the document copy command.
  let copied = false;
  try {
    outputText.select();
    copied = document.execCommand("copy");
  } catch (error) {
    copied = false;
  }

  if (copied) {
    notifySuccess();
  } else {
    notifyFailure();
  }
}

/**
 * Downloads the contents of #outputText as "ocr_result.txt".
 * Alerts and returns early when there is nothing to save.
 */
function downloadTextFile() {
  const outputText = document.getElementById("outputText").value;

  if (outputText.trim() === "") {
    alert("저장할 텍스트가 없습니다.");
    return;
  }

  const blob = new Blob([outputText], { type: "text/plain" });
  const url = URL.createObjectURL(blob);

  // A temporary anchor is the standard way to trigger a named download.
  const a = document.createElement("a");
  a.href = url;
  a.download = "ocr_result.txt";
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);

  // Release the Blob once the click has been dispatched; otherwise every
  // download keeps its data alive until the page is closed. Deferred so the
  // browser has started the download before the URL becomes invalid.
  setTimeout(() => URL.revokeObjectURL(url), 0);
}
</script>

</body>
</html>