From f9a16837ab0c75c1f3d8b24698303c4b972f6f71 Mon Sep 17 00:00:00 2001 From: "Md. Mosaddek Ali" Date: Sun, 22 Mar 2026 21:18:08 +0600 Subject: [PATCH 1/2] feat(pipelines): integrate GitHub issues ingestion into RAG pipeline Signed-off-by: Md. Mosaddek Ali --- pipelines/kubeflow-pipeline.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pipelines/kubeflow-pipeline.py b/pipelines/kubeflow-pipeline.py index 5d7d618..ea1d786 100644 --- a/pipelines/kubeflow-pipeline.py +++ b/pipelines/kubeflow-pipeline.py @@ -230,7 +230,7 @@ def chunk_and_embed( import re import torch from sentence_transformers import SentenceTransformer - from langchain.text_splitter import RecursiveCharacterTextSplitter + from langchain_text_splitter import RecursiveCharacterTextSplitter device = 'cuda' if torch.cuda.is_available() else 'cpu' model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device) @@ -420,6 +420,13 @@ def github_rag_pipeline( github_token=github_token ) + issues_task = download_github_issues( + repos="kubeflow/kubeflow,kubeflow/pipelines", + labels="", + state="open", + max_issues_per_repo=50, + github_token=github_token + ) # Chunk and embed the content chunk_task = chunk_and_embed( github_data=download_task.outputs["github_data"], @@ -428,7 +435,14 @@ def github_rag_pipeline( chunk_size=chunk_size, chunk_overlap=chunk_overlap ) - + issues_chunk_task = chunk_and_embed( + github_data=issues_task.outputs["issues_data"], + repo_name="kubeflow-issues", + base_url="https://github.com", + chunk_size=chunk_size, + chunk_overlap=chunk_overlap + ) + issues_chunk_task.after(issues_task) # Store in Milvus store_task = store_milvus( embedded_data=chunk_task.outputs["embedded_data"], From 8c1227b89665f2179b3ec0d892028bafa50d6867 Mon Sep 17 00:00:00 2001 From: "Md. Mosaddek Ali" Date: Mon, 23 Mar 2026 01:47:50 +0600 Subject: [PATCH 2/2] fix: align issues ingestion with pipeline params and correct import Signed-off-by: Md. Mosaddek Ali --- pipelines/kubeflow-pipeline.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pipelines/kubeflow-pipeline.py b/pipelines/kubeflow-pipeline.py index ea1d786..c7434ea 100644 --- a/pipelines/kubeflow-pipeline.py +++ b/pipelines/kubeflow-pipeline.py @@ -230,7 +230,7 @@ def chunk_and_embed( import re import torch from sentence_transformers import SentenceTransformer - from langchain_text_splitter import RecursiveCharacterTextSplitter + from langchain.text_splitter import RecursiveCharacterTextSplitter device = 'cuda' if torch.cuda.is_available() else 'cpu' model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device) @@ -421,20 +421,13 @@ def github_rag_pipeline( ) issues_task = download_github_issues( - repos="kubeflow/kubeflow,kubeflow/pipelines", + repos=f"{repo_owner}/{repo_name}", labels="", state="open", max_issues_per_repo=50, github_token=github_token ) - # Chunk and embed the content - chunk_task = chunk_and_embed( - github_data=download_task.outputs["github_data"], - repo_name=repo_name, - base_url=base_url, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap - ) + issues_chunk_task = chunk_and_embed( github_data=issues_task.outputs["issues_data"], repo_name="kubeflow-issues", @@ -442,7 +435,19 @@ def github_rag_pipeline( chunk_size=chunk_size, chunk_overlap=chunk_overlap ) + issues_chunk_task.after(issues_task) + + # Chunk and embed the content + chunk_task = chunk_and_embed( + github_data=download_task.outputs["github_data"], + repo_name=repo_name, + base_url=base_url, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap + ) + + # Store in Milvus store_task = store_milvus( embedded_data=chunk_task.outputs["embedded_data"],