Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from langchain_core.vectorstores import VectorStore
from tqdm import tqdm

from document_qa.grobid_processors import GrobidProcessor
from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError
from document_qa.langchain import ChromaAdvancedRetrieval


Expand Down Expand Up @@ -219,7 +219,7 @@ def __init__(self,
self.data_storage = data_storage

if grobid_url:
self.grobid_processor = GrobidProcessor(grobid_url)
self.grobid_processor = GrobidProcessor(grobid_url, ping_server=False)

Comment on lines 222 to 224
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Sanakhamassi why was this removed?

def query_document(
self,
Expand Down Expand Up @@ -376,6 +376,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
filename = Path(pdf_file_path).stem
coordinates = True # if chunk_size == -1 else False
structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
if not structure:
raise GrobidServiceError("Grobid did not return a response.")
Comment thread
lfoppiano marked this conversation as resolved.

biblio = structure['biblio']
biblio['filename'] = filename.replace(" ", "_")
Expand Down
36 changes: 25 additions & 11 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
from grobid_client.grobid_client import GrobidClient


class GrobidServiceError(RuntimeError):
    """Raised when the Grobid service fails to process a document.

    Attributes:
        status_code: The status code reported by the Grobid service when one
            is available, otherwise ``None`` (for example when no response
            was obtained at all).
    """

    def __init__(self, message="Grobid service error", status_code=None):
        """Create the error.

        Args:
            message: Human-readable description; becomes ``str(exc)``.
            status_code: Optional status code associated with the failure.
        """
        super().__init__(message)
        # Kept as an attribute so callers can branch on (or display) the
        # status without parsing the message text.
        self.status_code = status_code


def get_span_start(type, title=None):
    """Build the opening ``<span>`` tag used to decorate an annotation.

    Args:
        type: Label name appended to the ``label`` CSS class.
            (The name shadows the builtin, but is kept so that existing
            keyword callers continue to work.)
        title: Optional text for the ``title`` attribute; the attribute is
            omitted entirely when this is ``None``.

    Returns:
        The opening tag as a string, e.g. ``<span class="label foo">``.

    Note:
        ``type`` and ``title`` are inserted verbatim, without HTML escaping.
    """
    title_attr = f' title="{title}"' if title is not None else ""
    return f'<span class="label {type}"{title_attr}>'
Expand Down Expand Up @@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True):
self.grobid_client = grobid_client

def process_structure(self, input_path, coordinates=False):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
try:
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
except Exception as exc:
raise GrobidServiceError("Grobid service did not respond.") from exc
Comment thread
lfoppiano marked this conversation as resolved.

Comment on lines +116 to 120
if status != 200:
return
raise GrobidServiceError(
f"Grobid service returned status {status}.",
status_code=status
)
Comment thread
lfoppiano marked this conversation as resolved.
Comment on lines +108 to +125

document_object = self.parse_grobid_xml(text, coordinates=coordinates)
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
Expand Down Expand Up @@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False):
try:
year = dateparser.parse(doc_biblio.header.date).year
biblio["publication_year"] = year
except:
except Exception:
pass

output_data['biblio'] = biblio
Expand Down
42 changes: 25 additions & 17 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from tempfile import NamedTemporaryFile

import dotenv
import streamlit as st
from grobid_quantities.quantities import QuantitiesAPI
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from streamlit_pdf_viewer import pdf_viewer

from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
from document_qa.ner_client_generic import NERClientGeneric

dotenv.load_dotenv(override=True)

import streamlit as st
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations

API_MODELS = {
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
Expand Down Expand Up @@ -314,19 +313,28 @@ def play_old_messages(container):
st.stop()

with left_column:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
try:
with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
binary = uploaded_file.getvalue()
tmp_file = NamedTemporaryFile()
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
Comment thread
lfoppiano marked this conversation as resolved.
Outdated
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, @Sanakhamassi: here you need to either create the temporary file inside the `with (...)` statement, or otherwise make sure it is explicitly closed and cleaned up.

st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
except GrobidServiceError as exc:
message = str(exc).strip() or "Grobid is not responding"
status = f" (status {exc.status_code})" if exc.status_code else ""
st.session_state['doc_id'] = None
st.session_state['loaded_embeddings'] = False
st.session_state['uploaded'] = False
st.error(f"{message} Please try later.")
st.stop()


def rgb_to_hex(rgb):
Expand Down