"""Gradio app that turns a research-paper PDF into a technical article.

Pipeline: extract the PDF text with pdfplumber, index it with LlamaIndex,
run one retrieval-augmented query per entry in ``prompts``, then ask the LLM
to merge the seven answers into a single article using ``FINAL_PROMPT``.
"""
import os

import gradio as gr
import pdfplumber

# Updated imports for newer LlamaIndex versions
from llama_index.core import Settings, VectorStoreIndex, Document
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI

# Get API keys from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# os.environ only accepts str values, so writing back an unset (None) key
# would raise TypeError at import time. Only write it back when present.
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Pages for which ``extract_text()`` yields nothing are skipped. Page texts
    are joined with a newline so that the last word of one page is not fused
    with the first word of the next.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # The generator is consumed here, while the PDF is still open.
        return "\n".join(text for text in page_texts if text)


# Define the prompts for each RAG agent
prompts = [
    (
        "Generate a concise 150-word summary of the paper, integrating the "
        "following points: Introduction and Motivation, Proposed Solution, "
        "Methodology, Experiments and Results, Statistical Findings, "
        "Shortcomings, Future Work, and Conclusion. Ensure that statistical "
        "results are highlighted where relevant. Do not explicitly name the "
        "sections; instead, weave the information together into a cohesive "
        "narrative."
    ),
    (
        "Provide a comprehensive overview of the research context and "
        "background. Include the current state of the field, the gap or "
        "challenge the research addresses, and the significance of this "
        "problem within the broader discipline. Highlight the importance of "
        "the statistical methodologies used and why this research is "
        "statistically significant."
    ),
    (
        "Identify and describe the core contribution or innovation introduced "
        "by this research. What new approach, model, or framework is being "
        "proposed? Highlight the aspects that distinguish this work from "
        "existing methods, especially focusing on the statistical techniques "
        "or novel empirical findings that strengthen the contribution."
    ),
    (
        "Explain the methodologies, models, or frameworks employed in this "
        "research in detail. Describe how the proposed solution was developed "
        "and implemented, including specific algorithms, statistical "
        "techniques, and processes involved. \n"
        "Provide sufficient technical and statistical detail to allow for a "
        "clear understanding of the approach."
    ),
    (
        "Outline the experimental design and setup used to validate the "
        "research, including detailed information on the datasets (e.g., "
        "size, source, pre-processing), specific tasks or benchmarks, and the "
        "evaluation metrics employed. Present the numerical and statistical "
        "results, emphasizing key findings, performance metrics, and "
        "comparisons with existing methods or baselines. Highlight any "
        "significant p-values, confidence intervals, or other statistical "
        "metrics that are central to the findings."
    ),
    (
        "Conduct an in-depth analysis of the experimental results. Discuss "
        "the implications of the findings, including any patterns, insights, "
        "or unexpected outcomes. Evaluate the strengths and limitations of "
        "the results, particularly focusing on statistical validity. Compare "
        "these findings with theoretical expectations or human judgment, "
        "emphasizing the statistical significance of the results."
    ),
    (
        "Summarize the overall conclusions and their statistical "
        "significance. Identify any limitations or areas where the research "
        "could be improved, particularly in terms of statistical "
        "methodologies. Suggest potential solutions. Discuss possible future "
        "directions for research, including new questions, hypotheses, or "
        "statistical challenges that emerged from the study's findings."
    ),
]

# Instruction given to the LLM when merging the per-prompt answers.
FINAL_PROMPT = (
    "Integrate the outputs into a seamless and cohesive technical article. "
    "Carefully identify and remove any redundant or repeated information "
    "across the different sections, while preserving all essential "
    "statistical and empirical details. Ensure smooth and logical "
    "transitions between sections to maintain a coherent narrative flow. It "
    "is critical to avoid omitting any key information, particularly "
    "statistical findings or empirical results. The final article should be "
    "articulated in a clear, eloquent, and authoritative tone, as if "
    "delivered by a professor who has deeply engaged with the research paper "
    "and is conveying its content to students with a strong focus on "
    "statistical rigor. \n"
    "Ensure the article reads as a polished and professional technical "
    "piece, with an emphasis on statistical analysis and interpretation."
)


# Function to run a RAG agent with a specific prompt
def run_rag_agent(index, prompt):
    """Query *index* with *prompt* and return the answer.

    A fresh query engine is created from the index for every call.
    """
    query_engine = index.as_query_engine()
    response = query_engine.query(prompt)
    # Depending on the llama_index version, `response` might already be a
    # string; if it is an object, the text lives in its `response` attribute.
    if hasattr(response, "response"):
        return response.response
    return str(response)


# Function to run the final RAG agent using the LLM directly
def run_final_rag_agent(context, prompt, temperature):
    """Send *prompt* followed by *context* to the LLM and return its text.

    Args:
        context: Text appended to the prompt under a ``Context:`` heading.
        prompt: Instruction placed at the start of the completion request.
        temperature: Sampling temperature passed to the ``OpenAI`` LLM.
    """
    llm = OpenAI(temperature=temperature, model="gpt-4o-mini")
    full_prompt = f"{prompt}\n\nContext:\n{context}"
    return llm.complete(full_prompt).text


def process_pdf(pdf_path, temperature):
    """Generate the article for the PDF at *pdf_path*.

    Returns the article text on success. Any failure is caught and returned
    as a string starting with ``"Error processing PDF: "``, because the
    result is displayed directly in the UI text box.
    """
    try:
        # Extract text from PDF
        pdf_text = extract_text_from_pdf(pdf_path)
        if not pdf_text.strip():
            # Nothing to index (for example, an image-only PDF); report it
            # instead of building an index over an empty document.
            return "Error processing PDF: no extractable text found in the file."

        # Create a Document object with the new constructor syntax
        documents = [Document(text=pdf_text)]

        # Parse the document into nodes using SimpleNodeParser
        nodes = SimpleNodeParser().get_nodes_from_documents(documents)

        # Create the base index from the nodes
        index = VectorStoreIndex(nodes)

        # Run each RAG agent and store the results
        rag_outputs = [run_rag_agent(index, prompt) for prompt in prompts]

        # Combine all outputs for the final RAG agent
        combined_output = "\n\n".join(rag_outputs)
        return run_final_rag_agent(combined_output, FINAL_PROMPT, temperature)
    except Exception as e:
        # UI boundary: surface the failure to the user as text.
        return f"Error processing PDF: {str(e)}"


def main(pdf, temperature):
    """Gradio callback: build the article for the uploaded *pdf*.

    *pdf* is whatever the ``gr.File`` input passes in. Objects exposing a
    ``name`` attribute are resolved through it; anything else is treated as
    the path itself (presumably a plain string path on some Gradio versions;
    verify against the installed release).
    """
    if pdf is None:
        return "No file uploaded or an error occurred while uploading the file."
    pdf_path = getattr(pdf, "name", pdf)
    return process_pdf(pdf_path, temperature)


# Gradio interface
interface = gr.Interface(
    fn=main,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Slider(
            minimum=0,
            maximum=1,
            value=0.2,
            label="Temperature (Recommended = 0.2)",
        ),
    ],
    outputs="text",
    title="RAG-based Research Paper to Article",
    description=(
        "Upload a Research Paper PDF file to generate a summarized technical "
        "article using RAG agents. Please note that the first paragraph is a "
        "mini summary, the actual article starts from the second paragraph "
        "onward. Keeping the temperature closer to 0 often ensures most "
        "information from the paper is covered because it reduces creativity."
    ),
)

if __name__ == "__main__":
    interface.launch()