Skip to content

Commit ae8b1f3

Browse files
Merge pull request #152 from StarlightSearch/idefics3
Idefics3
2 parents d4bdbed + 077aed6 commit ae8b1f3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+3883
-202
lines changed

examples/colpali.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66

77
# Load the model
8-
# model: ColpaliModel = ColpaliModel.from_pretrained("vidore/colpali-v1.2-merged", None)
8+
model: ColpaliModel = ColpaliModel.from_pretrained("vidore/colpali-v1.2-merged", None)
99

1010
# Load ONNX Model
11-
model: ColpaliModel = ColpaliModel.from_pretrained_onnx(
12-
"starlight-ai/colpali-v1.2-merged-onnx", None
13-
)
11+
# model: ColpaliModel = ColpaliModel.from_pretrained_onnx(
12+
# "starlight-ai/colpali-v1.2-merged-onnx", None
13+
# )
1414

1515
# Get all PDF files in the directory
1616
directory = Path("test_files")

processors/src/docx_processor.rs

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
use std::path::Path;
2-
use docx_parser::MarkdownDocument;
3-
use text_splitter::ChunkConfigError;
41
use crate::markdown_processor::MarkdownProcessor;
52
use crate::processor::{Document, DocumentProcessor, FileProcessor};
3+
use docx_parser::MarkdownDocument;
4+
use std::path::Path;
5+
use text_splitter::ChunkConfigError;
66

77
/// A struct for processing DOCX files.
88
pub struct DocxProcessor {
@@ -12,9 +12,7 @@ pub struct DocxProcessor {
1212
impl DocxProcessor {
1313
pub fn new(chunk_size: usize, overlap: usize) -> Result<DocxProcessor, ChunkConfigError> {
1414
let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
15-
Ok(DocxProcessor {
16-
markdown_processor,
17-
})
15+
Ok(DocxProcessor { markdown_processor })
1816
}
1917
}
2018

@@ -34,16 +32,20 @@ mod tests {
3432
let txt_file = "../test_files/test.docx";
3533
let processor = DocxProcessor::new(128, 0).unwrap();
3634

37-
let text = processor.process_file(&txt_file).unwrap();
38-
assert!(text.chunks.contains(&"This is a docx file test".to_string()));
35+
let text = processor.process_file(txt_file).unwrap();
36+
assert!(text
37+
.chunks
38+
.contains(&"This is a docx file test".to_string()));
3939
}
4040

4141
// Returns an error if the file path is invalid.
4242
#[test]
43-
#[should_panic(expected = "Error processing file: IO(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })")]
43+
#[should_panic(
44+
expected = "Error processing file: IO(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })"
45+
)]
4446
fn test_extract_text_invalid_file_path() {
4547
let invalid_file_path = "this_file_definitely_does_not_exist.docx";
4648
let processor = DocxProcessor::new(128, 0).unwrap();
47-
processor.process_file(&invalid_file_path).unwrap();
49+
processor.process_file(invalid_file_path).unwrap();
4850
}
4951
}

processors/src/html_processor.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
use crate::markdown_processor::MarkdownProcessor;
2+
use crate::processor::{Document, DocumentProcessor};
13
use anyhow::Result;
24
use htmd::{HtmlToMarkdown, HtmlToMarkdownBuilder};
35
use text_splitter::ChunkConfigError;
4-
use crate::markdown_processor::MarkdownProcessor;
5-
use crate::processor::{Document, DocumentProcessor};
66

77
pub struct HtmlDocument {
88
pub content: String,
@@ -18,8 +18,7 @@ pub struct HtmlProcessor {
1818
impl HtmlProcessor {
1919
pub fn new(chunk_size: usize, overlap: usize) -> Result<HtmlProcessor, ChunkConfigError> {
2020
let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
21-
let html_to_markdown = HtmlToMarkdownBuilder::new()
22-
.build();
21+
let html_to_markdown = HtmlToMarkdownBuilder::new().build();
2322
Ok(HtmlProcessor {
2423
markdown_processor,
2524
html_to_markdown,
@@ -36,8 +35,8 @@ impl DocumentProcessor for HtmlProcessor {
3635

3736
#[cfg(test)]
3837
mod tests {
39-
use crate::processor::FileProcessor;
4038
use super::*;
39+
use crate::processor::FileProcessor;
4140

4241
#[test]
4342
fn test_process_html_file() {
processors/src/markdown_processor.rs

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,26 @@
1-
use text_splitter::{Characters, ChunkConfig, ChunkConfigError, MarkdownSplitter};
21
use crate::processor::{Document, DocumentProcessor};
2+
use text_splitter::{Characters, ChunkConfig, ChunkConfigError, MarkdownSplitter};
33

44
/// A struct that provides functionality to process Markdown files.
55
pub struct MarkdownProcessor {
6-
splitter: MarkdownSplitter<Characters>
6+
splitter: MarkdownSplitter<Characters>,
77
}
88

99
impl MarkdownProcessor {
1010
pub fn new(chunk_size: usize, overlap: usize) -> Result<MarkdownProcessor, ChunkConfigError> {
11-
let splitter_config = ChunkConfig::new(chunk_size)
12-
.with_overlap(overlap)?;
11+
let splitter_config = ChunkConfig::new(chunk_size).with_overlap(overlap)?;
1312
let splitter = MarkdownSplitter::new(splitter_config);
14-
Ok(MarkdownProcessor {
15-
splitter
16-
})
13+
Ok(MarkdownProcessor { splitter })
1714
}
1815
}
1916

2017
impl DocumentProcessor for MarkdownProcessor {
21-
2218
fn process_document(&self, content: &str) -> anyhow::Result<Document> {
23-
let chunks = self.splitter.chunks(content)
19+
let chunks = self
20+
.splitter
21+
.chunks(content)
2422
.map(|x| x.to_string())
2523
.collect();
26-
Ok(Document {
27-
chunks
28-
})
24+
Ok(Document { chunks })
2925
}
3026
}

processors/src/pdf/tesseract/command.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ use super::*;
44
use std::process::{Command, Stdio};
55
use std::string::ToString;
66

7+
use crate::pdf::tesseract::error::{TessError, TessResult};
78
#[cfg(target_os = "windows")]
89
use std::os::windows::process::CommandExt;
9-
use crate::pdf::tesseract::error::{TessError, TessResult};
1010

1111
#[cfg(target_os = "windows")]
1212
const CREATE_NO_WINDOW: u32 = 0x08000000;

processors/src/pdf/tesseract/input.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1+
use crate::pdf::tesseract::error::{TessError, TessResult};
12
use image::DynamicImage;
23
use std::{
34
collections::HashMap,
45
fmt::{self},
56
path::{Path, PathBuf},
67
};
7-
use crate::pdf::tesseract::error::{TessError, TessResult};
88

99
#[derive(Clone, Debug, PartialEq)]
1010
pub struct Args {
@@ -121,7 +121,10 @@ mod tests {
121121
fn test_from_path() {
122122
let input = Image::from_path("../test_files/clip/cat1.jpg").unwrap();
123123

124-
assert_eq!(input.get_image_path().unwrap(), "../test_files/clip/cat1.jpg")
124+
assert_eq!(
125+
input.get_image_path().unwrap(),
126+
"../test_files/clip/cat1.jpg"
127+
)
125128
}
126129

127130
#[test]

processors/src/pdf/tesseract/output_boxes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
use core::fmt;
21
use crate::pdf::tesseract::error::TessResult;
32
use crate::pdf::tesseract::input::{Args, Image};
43
use crate::pdf::tesseract::parse_line_util::{parse_next, FromLine};
4+
use core::fmt;
55

66
#[derive(Debug, PartialEq)]
77
pub struct BoxOutput {

processors/src/pdf/tesseract/output_config_parameters.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,7 @@ impl FromLine for ConfigParameter {
4545
}
4646
}
4747

48-
pub fn get_tesseract_config_parameters(
49-
) -> error::TessResult<ConfigParameterOutput> {
48+
pub fn get_tesseract_config_parameters() -> error::TessResult<ConfigParameterOutput> {
5049
let mut command = command::get_tesseract_command(None);
5150
command.arg("--print-parameters");
5251

@@ -72,7 +71,9 @@ fn string_to_config_parameter_output(
7271

7372
#[cfg(test)]
7473
mod tests {
75-
use crate::pdf::tesseract::output_config_parameters::{string_to_config_parameter_output, ConfigParameter};
74+
use crate::pdf::tesseract::output_config_parameters::{
75+
string_to_config_parameter_output, ConfigParameter,
76+
};
7677

7778
#[test]
7879
fn test_string_to_config_parameter_output() {
@@ -97,7 +98,8 @@ mod tests {
9798
#[test]
9899
fn test_get_tesseract_config_parameters() {
99100
let result =
100-
crate::pdf::tesseract::output_config_parameters::get_tesseract_config_parameters().unwrap();
101+
crate::pdf::tesseract::output_config_parameters::get_tesseract_config_parameters()
102+
.unwrap();
101103
let x = result
102104
.config_parameters
103105
.iter()

processors/src/pdf/tesseract/output_data.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,7 @@ impl FromLine for Data {
7373
}
7474
}
7575

76-
pub fn image_to_data(
77-
image: &Image,
78-
args: &Args,
79-
) -> error::TessResult<DataOutput> {
76+
pub fn image_to_data(image: &Image, args: &Args) -> error::TessResult<DataOutput> {
8077
let mut command = command::create_tesseract_command(image, args)?;
8178
command.arg("tsv");
8279

@@ -112,13 +109,12 @@ mod tests {
112109
top: 41,
113110
width: 46,
114111
height: 20,
115-
conf: 96.063751,
112+
conf: 96.063_75,
116113
text: String::from("The"),
117114
}
118115
)
119116
}
120117

121-
122118
#[test]
123119
fn test_string_to_data_parse_error() {
124120
let result = string_to_data("level page_num block_num par_num line_num word_num left top width height conf text\n\

processors/src/processor.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,32 @@
11
use std::path::Path;
22

33
pub trait DocumentProcessor {
4-
54
fn process_document(&self, content: &str) -> anyhow::Result<Document>;
65
}
76

87
pub trait FileProcessor {
9-
108
fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document>;
119
}
1210

1311
pub trait UrlProcessor {
1412
fn process_url(&self, url: &str) -> anyhow::Result<Document>;
1513
}
1614

17-
impl <T: DocumentProcessor> FileProcessor for T {
15+
impl<T: DocumentProcessor> FileProcessor for T {
1816
fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
1917
let bytes = std::fs::read(path)?;
2018
let out = String::from_utf8_lossy(&bytes);
2119
self.process_document(&out)
2220
}
2321
}
2422

25-
impl <T: DocumentProcessor> UrlProcessor for T {
23+
impl<T: DocumentProcessor> UrlProcessor for T {
2624
fn process_url(&self, url: &str) -> anyhow::Result<Document> {
2725
let content = reqwest::blocking::get(url)?.text()?;
2826
self.process_document(&content)
2927
}
3028
}
3129

3230
pub struct Document {
33-
pub chunks: Vec<String>
31+
pub chunks: Vec<String>,
3432
}

0 commit comments

Comments
 (0)