Upload 3 files
Browse files- cv_app.py +220 -0
- requirements.txt +14 -0
- vectorize_documents.py +45 -0
cv_app.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datetime import datetime
|
3 |
+
import streamlit as st
|
4 |
+
import google.generativeai as genai
|
5 |
+
import PyPDF2 as pdf
|
6 |
+
from fpdf import FPDF
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
9 |
+
from langchain_chroma import Chroma
|
10 |
+
|
11 |
+
# Load environment variables from a local .env file (e.g. GOOGLE_API_KEY).
load_dotenv()

# Configure the Generative AI client.
# SECURITY FIX: the API key was previously hard-coded as a string literal in
# source control (a leaked credential — it should be revoked). It is now read
# from the environment, which is also why load_dotenv() is called above.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
16 |
+
|
17 |
+
# Initialize vectorstore
@st.cache_resource
def setup_vectorstore():
    """Open the persisted Chroma collection stored in ``cv_vectordb``.

    Decorated with ``st.cache_resource`` so the embedding model and the
    database handle are created once per Streamlit server process and
    shared across reruns.
    """
    return Chroma(
        persist_directory="cv_vectordb",
        embedding_function=HuggingFaceEmbeddings(),
    )
|
23 |
+
|
24 |
+
# Convert PDF to text
def input_pdf_text(uploaded_file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        uploaded_file: A binary file-like object containing a PDF
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: The concatenated text of all pages. Pages whose extraction
        yields None (e.g. image-only pages) contribute an empty string.
    """
    reader = pdf.PdfReader(uploaded_file)
    # BUG FIX: the original did str(page.extract_text()), which turns a None
    # result into the literal word "None" inside the text fed to the model.
    # It also shadowed the loop index with the page object and used the
    # non-idiomatic range(len(...)) iteration.
    return "".join(page.extract_text() or "" for page in reader.pages)
|
32 |
+
|
33 |
+
# Retrieve relevant content from vectorstore
def retrieve_from_vectorstore(vectorstore, query):
    """Similarity-search the store for *query* and join the hits.

    Args:
        vectorstore: A vector store exposing ``as_retriever()``.
        query: Free-text query used for retrieval.

    Returns:
        str: The ``page_content`` of every retrieved document, joined
        with newlines.
    """
    matching_docs = vectorstore.as_retriever().invoke(query)
    return "\n".join(doc.page_content for doc in matching_docs)
|
38 |
+
|
39 |
+
# Get response from Generative AI
def get_gemini_response(prompt):
    """Send *prompt* to Gemini and return the first candidate's text.

    Args:
        prompt: The full prompt string to send to the model.

    Returns:
        str | None: The generated text, or None when there is no response
        or the response carries no usable candidate (e.g. the prompt was
        blocked by safety filters).
    """
    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content(prompt)
    try:
        # BUG FIX: the original indexed candidates[0].content.parts[0]
        # unconditionally; the trailing "if response" guard does not protect
        # against an empty candidates list, which raises IndexError for
        # blocked/empty responses. Callers already handle a None return.
        return response.candidates[0].content.parts[0].text if response else None
    except (IndexError, AttributeError):
        return None
|
44 |
+
|
45 |
+
def generate_pdf_report(candidate_name, report_content):
    """Render the model's plain-text report into a PDF file on disk.

    The report text is parsed line by line: known section headers are
    numbered and emphasized, "- " lines become bullet points, 4-column
    "|"-delimited lines become table rows, and everything else is plain
    paragraph text.

    Args:
        candidate_name: Used in the title line and the output file name.
        report_content: The raw text returned by the LLM.

    Returns:
        str: The generated PDF's file name (written to the working dir).
    """
    # NOTE: this local `pdf` shadows the module-level PyPDF2 alias `pdf`
    # within this function only.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(0, 8, txt=f"Candidate Report: {candidate_name}", ln=True, align="L")
    pdf.ln(5) # Add slight spacing after the title

    # Define numbered sections — the exact header strings the LLM is asked
    # to emit (see the "Output Format" part of the prompt); matching is by
    # string equality after stripping asterisks.
    numbered_sections = {
        1: "Candidate Name and Email",
        2: '"Can Do" list:',
        3: '"Should Do" list',
        4: "Skill Comparison Table:",
        5: "Overall Matching Score:",
        6: "Analysis of Strengths and Weaknesses",
        7: "Recommendations for Improvement",
        8: "Conclusion on Fitment",
    }

    # Parse report content
    lines = report_content.splitlines()
    current_section = None
    # NOTE(review): U+2022 is outside latin-1; with FPDF's built-in Arial
    # core font this may raise or mis-render depending on the fpdf/fpdf2
    # version installed — confirm against the pinned dependency.
    bullet_point = "\u2022 "  # Unicode for a bullet point

    for line in lines:
        stripped_line = line.strip().replace("*", "")  # Remove all asterisks
                                                       # (strips Markdown bold markers)

        # Check if line matches a section header
        if stripped_line in numbered_sections.values():
            for number, section in numbered_sections.items():
                if stripped_line == section:
                    current_section = number
                    pdf.set_font("Arial", style="", size=11)
                    pdf.cell(0, 6, txt=f"{number}. {section}", ln=True, align="L")
                    pdf.ln(3)  # Reduced spacing after each section header
                    break
        # Check for sub-content that starts with "-"
        elif current_section and stripped_line.startswith("- "):
            pdf.set_font("Arial", size=10)
            pdf.cell(5)  # Add slight indentation
            pdf.cell(0, 5, txt=f"{bullet_point}{stripped_line[2:]}", ln=True)
        # Handle table rows — only well-formed 4-column rows are rendered;
        # header-separator rows like |----|----| fail the len(cells) == 4
        # check only if they don't split into exactly 4 cells.
        elif "|" in stripped_line:
            cells = [cell.strip() for cell in stripped_line.split("|")[1:-1]]
            if len(cells) == 4:
                pdf.set_font("Arial", size=9)
                pdf.cell(50, 6, cells[0], border=1)
                pdf.cell(35, 6, cells[1], border=1, align="C")
                pdf.cell(35, 6, cells[2], border=1, align="C")
                pdf.cell(35, 6, cells[3], border=1, align="C")
                pdf.ln()
        # Add regular content as plain text
        else:
            pdf.set_font("Arial", size=10)
            pdf.multi_cell(0, 5, stripped_line)

    # Save the report as a PDF file; the timestamp keeps repeated runs for
    # the same candidate from overwriting each other.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = f"{candidate_name}_report_{timestamp}.pdf"
    pdf.output(file_name)
    return file_name
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
# Streamlit UI
# Page header and a one-line description of the tool.
st.title("AI-Powered Candidate Shortlisting")
st.markdown("Analyze resumes, job descriptions, and match with company culture using RAG.")

# Setup vectorstore
# Opens the persisted "cv_vectordb" Chroma collection; cached, so this is
# cheap on Streamlit reruns. The name is read by the button handler below.
vectorstore = setup_vectorstore()

# File upload
# Multiple candidate resumes plus exactly one job description, all PDFs.
uploaded_resumes = st.file_uploader("Upload Resumes (PDFs)", type="pdf", accept_multiple_files=True)
uploaded_job_description = st.file_uploader("Upload Job Description (PDF)", type="pdf")
|
119 |
+
|
120 |
+
# Button handler: extract text from the uploads, build one prompt per resume,
# have Gemini produce a fitment report, render it to PDF, and list the
# candidates ranked by their parsed matching score.
if st.button("Generate Fitment Reports"):
    if not uploaded_resumes or not uploaded_job_description:
        st.error("Please upload resumes and a job description.")
    else:
        with st.spinner("Processing..."):
            try:
                # Convert job description to text
                job_description_text = input_pdf_text(uploaded_job_description)

                # Retrieve relevant content from vectorstore
                # (fixed retrieval query — the same culture context is
                # reused for every candidate)
                company_culture_content = retrieve_from_vectorstore(vectorstore, "company culture match")

                # Process each resume
                fitment_results = []
                for resume_file in uploaded_resumes:
                    # Extract candidate name from the file name (sans extension)
                    candidate_name = os.path.splitext(resume_file.name)[0]

                    # Convert resume to text
                    resume_text = input_pdf_text(resume_file)

                    # Construct the prompt. The "Output Format" section headers
                    # below must stay in sync with the numbered_sections dict
                    # in generate_pdf_report, which matches them literally.
                    input_prompt = f"""
### Task: Generate a candidate shortlisting report.

### Instructions:
You are a highly intelligent and unbiased system designed to shortlist candidates for a job based on:
1. The candidate's resume.
2. A provided job description.
3. Relevant company culture data retrieved from the vector database.

### Key Objectives:
- Analyze skills, qualifications, and experiences in the resume.
- Evaluate alignment with the job description.
- Assess cultural fit using company culture data.
- Provide detailed scoring, strengths, weaknesses, and recommendations.

### Required Sections in the Report:
- Candidate Name and Email
- Parse the job description and create a 'Should Do' list, categorizing required skills into levels: Beginner, Competent, Intermediate, Expert.
- Parse the candidate's resume and create a 'Can Do' list, categorizing listed skills into the same levels: Beginner, Competent, Intermediate, Expert.
- Matching score: A detailed table showing alignment of skills.
- Analysis of strengths and weaknesses.
- Recommendations for improvement.
- Overall conclusion.

### Input Data:
- **Resume**: {resume_text}
- **Job Description**: {job_description_text}
- **Company Culture Data**: {company_culture_content}

### Output Format:
1. Candidate Name and Email
2."Can Do" list:
3. "Should Do" list
4. Skill Comparison Table:
| Skill | "Can Do" Level | "Should Do" Level | Matching Score |
|--------------------------|----------------|--------------------|----------------|
5. Overall Matching Score: [Percentage]
6. Analysis of Strengths and Weaknesses
7. Recommendations for Improvement
8. Conclusion on Fitment
"""

                    # Generate the report
                    report_content = get_gemini_response(input_prompt)

                    # Candidates with no usable LLM response are silently
                    # skipped (they simply don't appear in the results).
                    if report_content:
                        # Extract the matching score safely — the score is
                        # parsed back out of the free-text report, so failures
                        # fall back to 0.0 and are flagged in the PDF.
                        try:
                            matching_score = float(report_content.split("Overall Matching Score:")[1].split("%")[0].strip())
                        except (IndexError, ValueError):
                            matching_score = 0.0
                            report_content += "\n\n[ERROR: Matching Score could not be parsed]"

                        # Generate PDF report
                        report_file = generate_pdf_report(candidate_name, report_content)

                        # Save results
                        fitment_results.append((candidate_name, matching_score, report_file))

                # Sort results by matching score in descending order
                fitment_results.sort(key=lambda x: x[1], reverse=True)

                # Display results in tabular form
                st.write("### Fitment Results")
                st.write("Below are the shortlisted candidates ranked by their fitment scores.")
                for rank, (candidate_name, matching_score, report_file) in enumerate(fitment_results, start=1):
                    col1, col2, col3, col4 = st.columns([3, 2, 2, 2])
                    col1.write(candidate_name)
                    col2.write(f"{matching_score:.2f}%")
                    col3.write(f"Rank {rank}")
                    # NOTE(review): multiple download_button widgets share the
                    # same label here — confirm Streamlit does not require an
                    # explicit unique `key` for this version.
                    with open(report_file, "rb") as f:
                        col4.download_button(
                            label="Download Report",
                            data=f,
                            file_name=os.path.basename(report_file),
                            mime="application/pdf",
                        )
            # Broad catch is acceptable at this top-level UI boundary: any
            # failure is surfaced to the user instead of crashing the app.
            except Exception as e:
                st.error(f"Error generating fitment reports: {e}")
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.38.0
|
2 |
+
langchain-community==0.2.16
|
3 |
+
langchain-text-splitters==0.2.4
|
4 |
+
langchain-chroma==0.1.3
|
5 |
+
langchain-huggingface==0.0.3
|
6 |
+
langchain-groq==0.1.9
|
7 |
+
unstructured==0.15.0
|
8 |
+
unstructured[pdf]==0.15.0
|
9 |
+
nltk==3.8.1
|
10 |
+
psycopg2-binary
|
11 |
+
pgvector
|
12 |
+
langchain_postgres
|
13 |
+
docx2txt
|
14 |
+
PyPDF2
google-generativeai
fpdf
python-dotenv
|
vectorize_documents.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import UnstructuredFileLoader
|
2 |
+
from langchain_community.document_loaders import DirectoryLoader
|
3 |
+
from langchain_text_splitters import CharacterTextSplitter
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
from langchain_chroma import Chroma
|
6 |
+
|
7 |
+
|
8 |
+
# Define a function to perform vectorization
def vectorize_documents():
    """Load PDFs from ``cv_data``, chunk them, and persist embeddings.

    Reads every top-level ``*.pdf`` file in the ``cv_data`` directory,
    splits the extracted text into 2000-character chunks with a
    500-character overlap, and stores the embedded chunks in the
    ``cv_vectordb`` Chroma directory (the same directory cv_app.py opens).

    Returns:
        Chroma: The populated vector store. (The function previously
        returned None; returning the store is backward compatible and lets
        callers reuse it without reopening the directory.)
    """
    embeddings = HuggingFaceEmbeddings()

    loader = DirectoryLoader(
        path="cv_data",
        glob="./*.pdf",
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Splitting the text and creating chunks of these documents so each
    # embedding covers a manageable span of text.
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500,
    )
    text_chunks = text_splitter.split_documents(documents)

    # Store in Chroma vector DB (persisted on disk).
    vectordb = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory="cv_vectordb",
    )

    print("Documents Vectorized and saved in VectorDB")
    return vectordb
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
# Expose embeddings if needed
# NOTE(review): instantiating HuggingFaceEmbeddings at module level loads the
# embedding model as an import-time side effect for every importer, and
# duplicates the instance created inside vectorize_documents(). Kept as-is in
# case other modules do `from vectorize_documents import embeddings`; consider
# making this lazy.
embeddings = HuggingFaceEmbeddings()


# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()
|