Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from langchain_core.vectorstores import VectorStore
from tqdm import tqdm

from document_qa.grobid_processors import GrobidProcessor
from document_qa.grobid_processors import GrobidProcessor, GrobidServiceError
from document_qa.langchain import ChromaAdvancedRetrieval


Expand Down Expand Up @@ -376,6 +376,8 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
filename = Path(pdf_file_path).stem
coordinates = True # if chunk_size == -1 else False
structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
if not structure:
raise GrobidServiceError("Grobid did not return a response.")
Comment thread
lfoppiano marked this conversation as resolved.

biblio = structure['biblio']
biblio['filename'] = filename.replace(" ", "_")
Expand Down
36 changes: 25 additions & 11 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
from grobid_client.grobid_client import GrobidClient


class GrobidServiceError(RuntimeError):
    """Raised when the Grobid service fails to process a document.

    Attributes:
        status_code: The value given by the raiser via ``status_code``;
            ``None`` when it was not supplied.
    """

    def __init__(self, message="Grobid service error", status_code=None):
        """Store *message* as the exception text and keep *status_code*.

        Args:
            message: Human-readable description of the failure.
            status_code: Optional status associated with the failure.
        """
        super().__init__(message)
        # Kept as a plain attribute so handlers can inspect it after catching.
        self.status_code = status_code


def get_span_start(type, title=None):
    """Return the opening ``<span>`` tag for an annotation label.

    Args:
        type: Label name appended after ``label`` in the ``class`` attribute.
            (The name shadows the builtin; it is kept so existing keyword
            callers keep working.)
        title: Optional text for the ``title`` attribute. The attribute is
            omitted only when this is ``None``; an empty string still
            produces ``title=""``.

    Returns:
        The opening tag as a string, e.g. ``<span class="label x" title="y">``.

    Note:
        Both values are inserted verbatim; no HTML escaping is applied here.
    """
    title_attr = f' title="{title}"' if title is not None else ""
    return f'<span class="label {type}"{title_attr}>'
Expand Down Expand Up @@ -97,18 +105,24 @@ def __init__(self, grobid_url, ping_server=True):
self.grobid_client = grobid_client

def process_structure(self, input_path, coordinates=False):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
try:
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
consolidate_citations=False,
segment_sentences=False,
tei_coordinates=coordinates,
include_raw_citations=False,
include_raw_affiliations=False,
generateIDs=True)
except Exception as exc:
raise GrobidServiceError("Grobid service did not respond.") from exc
Comment thread
lfoppiano marked this conversation as resolved.

Comment on lines +116 to 120
if status != 200:
return
raise GrobidServiceError(
f"Grobid service returned status {status}.",
status_code=status
)
Comment thread
lfoppiano marked this conversation as resolved.
Comment on lines +108 to +125

document_object = self.parse_grobid_xml(text, coordinates=coordinates)
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
Expand Down Expand Up @@ -137,7 +151,7 @@ def parse_grobid_xml(self, text, coordinates=False):
try:
year = dateparser.parse(doc_biblio.header.date).year
biblio["publication_year"] = year
except:
except Exception:
pass

output_data['biblio'] = biblio
Expand Down
29 changes: 18 additions & 11 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from tempfile import NamedTemporaryFile

import dotenv
import streamlit as st
from grobid_quantities.quantities import QuantitiesAPI
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from streamlit_pdf_viewer import pdf_viewer

from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations, GrobidServiceError
from document_qa.ner_client_generic import NERClientGeneric

dotenv.load_dotenv(override=True)

import streamlit as st
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations

API_MODELS = {
"microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
"Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
Expand Down Expand Up @@ -320,13 +319,21 @@ def play_old_messages(container):
tmp_file.write(bytearray(binary))
st.session_state['binary'] = binary

st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
try:
st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(
tmp_file.name,
chunk_size=chunk_size,
perc_overlap=0.1
)
Comment thread
lfoppiano marked this conversation as resolved.
Outdated
st.session_state['loaded_embeddings'] = True
st.session_state.messages = []
except GrobidServiceError as exc:
status = f" (status {exc.status_code})" if exc.status_code else ""
st.session_state['doc_id'] = None
st.session_state['loaded_embeddings'] = False
st.session_state['uploaded'] = False
st.error(f"Grobid is not responding{status}. Please try later.")
Comment thread
lfoppiano marked this conversation as resolved.
Outdated
Comment thread
lfoppiano marked this conversation as resolved.
Outdated
st.stop()


def rgb_to_hex(rgb):
Expand Down