Spaces:

OrganizedProgrammers
/

Docxtract

Sleeping

App Files Community

Lucas ARRIESSE committed on Aug 11

Commit

f3bf993

1 Parent(s): 2c86f91

Random fixes + doc bits

Browse files

Files changed (5) hide show

api/docs.py +17 -15
doc/doc.md +6 -0
schemas.py +17 -14
static/index.html +5 -6
static/js/app.js +3 -3

api/docs.py CHANGED Viewed

@@ -22,7 +22,7 @@ from fastapi.responses import StreamingResponse
 from litellm.router import Router
 from kreuzberg import ExtractionConfig, extract_bytes
-from schemas import DataRequest, DataResponse, DocRequirements, DocDownloadRequest, MeetingsRequest, MeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
 # API router for requirement extraction from docs / doc list retrieval / download
 router = APIRouter(tags=["document extraction"])
@@ -108,6 +108,7 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
 # Rate limit of FTP downloads per minute
 FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=120, time_period=60)
 async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
     """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
@@ -241,8 +242,11 @@ async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
 # ============================================= Doc routes =========================================================
-@router.post("/get_meetings", response_model=MeetingsResponse)
-async def get_meetings(req: MeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
     # Extracting WG
     working_group = req.working_group
     tsg = re.sub(r"\d+", "", working_group)
@@ -278,13 +282,13 @@ async def get_meetings(req: MeetingsRequest, http_client: AsyncClient = Depends(
         all_meetings = [working_group + "#" + meeting.split("_", 1)[1].replace("_", " ").replace(
             "-", " ") if meeting.startswith('TSG') else meeting.replace("-", "#") for meeting in meeting_folders]
-    return MeetingsResponse(meetings=dict(zip(all_meetings, meeting_folders)))
 # ============================================================================================================================================
-@router.post("/get_dataframe", response_model=DataResponse)
-async def get_docs_df(req: DataRequest, http_client: AsyncClient = Depends(get_http_client)):
     """
     Downloads the document list dataframe for a given meeting
     """
@@ -314,22 +318,20 @@ async def get_docs_df(req: DataRequest, http_client: AsyncClient = Depends(get_h
     if files == []:
         raise HTTPException(status_code=404, detail="No XLSX has been found")
-    def gen_url(tdoc: str):
-        return f"{url}/{tdoc}.zip"
     df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
     filtered_df = df[~(
         df["Uploaded"].isna())][["TDoc", "Title", "CR category", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
-    filtered_df["URL"] = filtered_df["TDoc"].apply(gen_url)
     df = filtered_df.fillna("")
-    return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
 # ==================================================================================================================================
-@router.post("/download_tdocs")
-async def download_tdocs(req: DocDownloadRequest, http_client: AsyncClient = Depends(get_http_client)):
     """Download the specified TDocs and zips them in a single archive"""
     # Document IDs to download
@@ -379,8 +381,8 @@ class ProgressUpdate(BaseModel):
     processed_docs: int
-@router.post("/generate_requirements/sse")
-async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends(get_llm_router), http_client: AsyncClient = Depends(get_http_client)):
     """Extract requirements from the specified xxxxCR docs using a LLM and returns SSE events about the progress of ongoing operations"""
     documents = req.documents

 from litellm.router import Router
 from kreuzberg import ExtractionConfig, extract_bytes
+from schemas import GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
 # API router for requirement extraction from docs / doc list retrieval / download
 router = APIRouter(tags=["document extraction"])
 # Rate limit of FTP downloads per minute
 FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=120, time_period=60)
 async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
     """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
 # ============================================= Doc routes =========================================================
+@router.post("/get_meetings", response_model=GetMeetingsResponse)
+async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
+    """
+    Retrieves the list of meetings for the given working group.
+    """
     # Extracting WG
     working_group = req.working_group
     tsg = re.sub(r"\d+", "", working_group)
         all_meetings = [working_group + "#" + meeting.split("_", 1)[1].replace("_", " ").replace(
             "-", " ") if meeting.startswith('TSG') else meeting.replace("-", "#") for meeting in meeting_folders]
+    return GetMeetingsResponse(meetings=dict(zip(all_meetings, meeting_folders)))
 # ============================================================================================================================================
+@router.post("/get_meeting_docs", response_model=GetMeetingDocsResponse)
+async def get_meeting_docs(req: GetMeetingDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> GetMeetingDocsResponse:
     """
     Downloads the document list dataframe for a given meeting
     """
     if files == []:
         raise HTTPException(status_code=404, detail="No XLSX has been found")
     df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
     filtered_df = df[~(
         df["Uploaded"].isna())][["TDoc", "Title", "CR category", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
+    filtered_df["URL"] = filtered_df["TDoc"].apply(
+        lambda tdoc: f"{url}/{tdoc}.zip")
     df = filtered_df.fillna("")
+    return GetMeetingDocsResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
 # ==================================================================================================================================
+@router.post("/download_docs")
+async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Depends(get_http_client)) -> StreamingResponse:
     """Download the specified TDocs and zips them in a single archive"""
     # Document IDs to download
     processed_docs: int
+@router.post("/extract_requirements/sse")
+async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_router: Router = Depends(get_llm_router), http_client: AsyncClient = Depends(get_http_client)):
     """Extract requirements from the specified xxxxCR docs using a LLM and returns SSE events about the progress of ongoing operations"""
     documents = req.documents

doc/doc.md CHANGED Viewed

@@ -1,6 +1,12 @@
 ## Reqxtract
 ### General flow
 The general use flow for the project is as follows

 ## Reqxtract
+### API router list
+- `api/docs.py` : Handles document processing and extraction of requirements.
+- `api/requirements.py` : Handles requirements processing and operations.
+- `api/solutions.py` : Handles solution generation and critique.
 ### General flow
 The general use flow for the project is as follows

schemas.py CHANGED Viewed

@@ -2,25 +2,26 @@ from pydantic import BaseModel, Field
 from typing import Any, List, Dict, Literal, Optional
-class MeetingsRequest(BaseModel):
-    working_group: str
-class MeetingsResponse(BaseModel):
-    meetings: Dict[str, str]
-# --------------------------------------
-class DataRequest(BaseModel):
     working_group: str
     meeting: str
-class DataResponse(BaseModel):
     data: List[Dict[Any, Any]]
-# --------------------------------------
 class DocInfo(BaseModel):
     """
@@ -34,6 +35,13 @@ class DocInfo(BaseModel):
     type: str
 class ExtractRequirementsRequest(BaseModel):
     documents: List[DocInfo]
@@ -73,11 +81,6 @@ class ReqSearchResponse(BaseModel):
 # --------------------------------------
-class DocDownloadRequest(BaseModel):
-    documents: List[DocInfo] = Field(
-        description="List of documents to download")
 class ReqGroupingCategory(BaseModel):
     """Represents the category of requirements grouped together"""
     id: int = Field(..., description="ID of the grouping category")

 from typing import Any, List, Dict, Literal, Optional
+# --------------------------------------- Document related endpoints ---------------------------------------
+class GetMeetingsRequest(BaseModel):
+    working_group: Literal["SA1", "SA2", "SA3", "SA4", "SA5", "SA6",
+                           "CT1", "CT2", "CT3", "CT4", "CT5", "CT6", "RAN1", "RAN2"]
+class GetMeetingsResponse(BaseModel):
+    meetings: Dict[str, str] = Field(
+        ..., description="Mapping of meetings with as key their display name and value the FTP meeting name.")
+class GetMeetingDocsRequest(BaseModel):
     working_group: str
     meeting: str
+class GetMeetingDocsResponse(BaseModel):
     data: List[Dict[Any, Any]]
 class DocInfo(BaseModel):
     """
     type: str
+class DownloadDocsRequest(BaseModel):
+    documents: List[DocInfo] = Field(
+        description="List of documents to download")
+# --------------------------------------
 class ExtractRequirementsRequest(BaseModel):
     documents: List[DocInfo]
 # --------------------------------------
 class ReqGroupingCategory(BaseModel):
     """Represents the category of requirements grouped together"""
     id: int = Field(..., description="ID of the grouping category")

static/index.html CHANGED Viewed

@@ -4,7 +4,7 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Requirements Extractor</title>
     <!--See JS imports for ESM modules-->
     <link href="https://cdn.jsdelivr.net/npm/daisyui@5" rel="stylesheet" type="text/css" />
     <script src="https://cdn.jsdelivr.net/npm/@tailwindcss/browser@4"></script>
@@ -22,7 +22,7 @@
     <div class="container mx-auto p-6">
         <div class="relative flex justify-center items-center p-4">
-            <h1 class="text-3xl font-bold">Requirements Extractor</h1>
             <button class="absolute right-4 btn btn-sm btn-circle btn-outline" id="settings-btn"
                 onclick="settings_modal.showModal()" title="Settings">🔧</button>
         </div>
@@ -159,12 +159,12 @@
                     <div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
                         <button id="extract-requirements-btn"
                             class="bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
-                            Extract Requirements
                         </button>
                     </div>
-                    <div class="tooltip" data-tip="Download all selected TDocs as text files">
                         <button id="download-tdocs-btn" class="text-sm rounded px-3 py-1 shadow cursor-pointer">
-                            📦 Download Selected TDocs
                         </button>
                     </div>
                 </div>
@@ -498,7 +498,6 @@
                     </div>
             </div>
         </dialog>
         <script type="module" src="js/app.js"></script>
 </body>

 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Docxtract</title>
     <!--See JS imports for ESM modules-->
     <link href="https://cdn.jsdelivr.net/npm/daisyui@5" rel="stylesheet" type="text/css" />
     <script src="https://cdn.jsdelivr.net/npm/@tailwindcss/browser@4"></script>
     <div class="container mx-auto p-6">
         <div class="relative flex justify-center items-center p-4">
+            <h1 class="text-3xl font-bold">Docxtract</h1>
             <button class="absolute right-4 btn btn-sm btn-circle btn-outline" id="settings-btn"
                 onclick="settings_modal.showModal()" title="Settings">🔧</button>
         </div>
                     <div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
                         <button id="extract-requirements-btn"
                             class="bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
+                            Extract Requirements from CRs
                         </button>
                     </div>
+                    <div class="tooltip" data-tip="Download all selected docs as text files">
                         <button id="download-tdocs-btn" class="text-sm rounded px-3 py-1 shadow cursor-pointer">
+                            📦 Download Selected Docs
                         </button>
                     </div>
                 </div>
                     </div>
             </div>
         </dialog>
         <script type="module" src="js/app.js"></script>
 </body>

static/js/app.js CHANGED Viewed

@@ -74,7 +74,7 @@ async function getTDocs() {
     toggleElementsEnabled(['get-tdocs-btn'], false);
     try {
-        const response = await fetch('/docs/get_dataframe', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
             body: JSON.stringify({ working_group: workingGroup, meeting: meeting })
@@ -245,7 +245,7 @@ async function downloadTDocs() {
         // on prend tout
         const documents = selectedData;
-        const response = await fetch('/docs/download_tdocs', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
             body: JSON.stringify({ documents: documents })
@@ -336,7 +336,7 @@ async function extractRequirements() {
     toggleElementsEnabled(['extract-requirements-btn'], false);
     try {
-        const response = await postWithSSE('/docs/generate_requirements/sse', { documents: documents }, {
             onMessage: (msg) => {
                 console.log("SSE message:");
                 console.log(msg);

     toggleElementsEnabled(['get-tdocs-btn'], false);
     try {
+        const response = await fetch('/docs/get_meeting_docs', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
             body: JSON.stringify({ working_group: workingGroup, meeting: meeting })
         // on prend tout
         const documents = selectedData;
+        const response = await fetch('/docs/download_docs', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
             body: JSON.stringify({ documents: documents })
     toggleElementsEnabled(['extract-requirements-btn'], false);
     try {
+        const response = await postWithSSE('/docs/extract_requirements/sse', { documents: documents }, {
             onMessage: (msg) => {
                 console.log("SSE message:");
                 console.log(msg);