feat(server): improve pdf parsing (#12356)

This commit is contained in:
darkskygit 2025-05-27 11:36:47 +00:00
parent 3c0fa429c5
commit 7175019a0a
No known key found for this signature in database
GPG Key ID: 97B7D036B1566E9D
3 changed files with 92 additions and 14 deletions

57
Cargo.lock generated
View File

@ -20,8 +20,7 @@ checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "adobe-cmap-parser"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
source = "git+https://github.com/darkskygit/adobe-cmap-parser#610513ae6035c63eab69f33299b86c43693cabb4"
dependencies = [
"pom",
]
@ -2737,9 +2736,9 @@ dependencies = [
[[package]]
name = "path-ext"
version = "0.1.1"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de7a86239a8b87b5094977b64893fcf0ed768072744dd4ee0df237686b2d815"
checksum = "7603010004b5cdecf8006605bf7b6f07b0e59d3003010f52b767e91bf2582a45"
dependencies = [
"path-slash",
"walkdir",
@ -2754,7 +2753,7 @@ checksum = "1e91099d4268b0e11973f036e885d652fb0b21fedcf69738c627f94db6a44f42"
[[package]]
name = "pdf-extract"
version = "0.8.2"
source = "git+https://github.com/toeverything/pdf-extract?branch=darksky%2Fimprove-font-decoding#e74beed894e1b8dc228c2bf078ed92814b27759f"
source = "git+https://github.com/toeverything/pdf-extract?branch=darksky%2Fimprove-font-decoding#040751a61aba51e7a28217b758c18db4415c3ee4"
dependencies = [
"adobe-cmap-parser",
"cff-parser",
@ -2763,6 +2762,7 @@ dependencies = [
"log",
"lopdf",
"postscript",
"rust-embed",
"type1-encoding-parser",
"unicode-normalization",
]
@ -2943,9 +2943,12 @@ checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
[[package]]
name = "postscript"
version = "0.14.1"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
checksum = "9a2238e788cf2c9b6edc23b83cf8ccdd4a6380cc9bf0598cc220fac42a55def6"
dependencies = [
"typeface",
]
[[package]]
name = "potential_utf"
@ -3333,6 +3336,40 @@ dependencies = [
"realfft",
]
[[package]]
name = "rust-embed"
version = "8.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "025908b8682a26ba8d12f6f2d66b987584a4a87bc024abc5bbc12553a8cd178a"
dependencies = [
"rust-embed-impl",
"rust-embed-utils",
"walkdir",
]
[[package]]
name = "rust-embed-impl"
version = "8.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6065f1a4392b71819ec1ea1df1120673418bf386f50de1d6f54204d836d4349c"
dependencies = [
"proc-macro2",
"quote",
"rust-embed-utils",
"syn 2.0.101",
"walkdir",
]
[[package]]
name = "rust-embed-utils"
version = "8.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6cc0c81648b20b70c491ff8cce00c1c3b223bb8ed2b5d41f0e54c6c4c0a3594"
dependencies = [
"sha2",
"walkdir",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
@ -4670,6 +4707,12 @@ dependencies = [
"pom",
]
[[package]]
name = "typeface"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4f6b49e025f4dc953a29b83e4f5a905089117d09fa53491015d7678951b8be1"
[[package]]
name = "typenum"
version = "1.18.0"

View File

@ -57,7 +57,7 @@ objc2-foundation = "0.3"
once_cell = "1"
ordered-float = "5"
parking_lot = "0.12"
path-ext = "0.1.1"
path-ext = "0.1.2"
pdf-extract = { git = "https://github.com/toeverything/pdf-extract", branch = "darksky/improve-font-decoding" }
phf = { version = "0.11", features = ["macros"] }
proptest = "1.3"

View File

@ -45,19 +45,29 @@ impl Loader for PdfExtractLoader {
#[cfg(test)]
mod tests {
use std::{fs::read, io::Cursor, path::PathBuf};
use std::{
fs::read,
io::Cursor,
path::{Path, PathBuf},
};
use path_ext::PathExt;
use super::*;
#[test]
fn test_parse_pdf() {
let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures");
let buffer = read(fixtures.join("sample.pdf")).unwrap();
/// Reads the file at `path` fully into memory, wraps the bytes in a `Cursor`,
/// and runs them through `PdfExtractLoader`, returning the loaded documents.
///
/// # Panics
///
/// Panics if the file cannot be read, if the loader cannot be constructed
/// ("Failed to create PdfExtractLoader"), or if loading fails.
fn parse_pdf_content(path: &Path) -> Vec<Document> {
let buffer = read(path).unwrap();
let reader = Cursor::new(buffer);
let loader = PdfExtractLoader::new(reader).expect("Failed to create PdfExtractLoader");
// NOTE(review): the `let docs = …` statement below and the tail expression after it
// both call `loader.load()`; this looks like the pre- and post-refactor versions of
// the same line shown together by the diff view — confirm only the tail expression
// is present in the actual file.
let docs = loader.load().unwrap();
loader.load().unwrap()
}
#[test]
fn test_parse_pdf() {
let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures");
let docs = parse_pdf_content(&fixtures.join("sample.pdf"));
assert_eq!(docs.len(), 1);
assert_eq!(
@ -66,4 +76,29 @@ mod tests {
consectetuer a"
);
}
/// Debugging aid: parses a directory of PDFs and prints per-file statistics.
///
/// The directory is taken from the process arguments, scanned from last to
/// first; the first argument that names an existing directory is used. Every
/// path yielded by `walk_iter` that is a file with the extension string `pdf`
/// is parsed with `parse_pdf_content`, and the number of returned documents
/// ("chunks") plus the total whitespace-separated word count is printed.
///
/// Ignored by default; run it explicitly and pass the directory as an extra
/// argument to the test binary.
///
/// # Panics
///
/// Panics with "No directory provided" when no argument is a directory, and
/// whenever `parse_pdf_content` panics on one of the files.
#[test]
#[ignore = "for debugging only"]
fn test_parse_pdf_custom() {
    // Walk the arguments back-to-front so the last directory argument wins.
    let Some(fixtures) = std::env::args()
        .rev()
        .map(PathBuf::from)
        .find(|candidate| candidate.is_dir())
    else {
        panic!("No directory provided");
    };

    for path in fixtures.walk_iter(|p| p.is_file() && p.ext_str() == "pdf") {
        println!("Parsing: {}", path.display());
        let docs = parse_pdf_content(&path);
        let chunks = docs.len();
        // Count actual words; `page_content.len()` would report a length,
        // which does not match the "words" label printed below.
        let words = docs
            .iter()
            .map(|d| d.page_content.split_whitespace().count())
            .sum::<usize>();
        println!("{}: {} chunks, {} words", path.display(), chunks, words);
    }
}
}