|
import logging |
|
import os |
|
import docx |
|
import PyPDF2 |
|
from docx.shared import RGBColor, Pt |
|
from io import BytesIO, IOBase |
|
import tempfile |
|
import re |
|
import datetime |
|
import torch |
|
|
|
import gradio as gr |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
import huggingface_hub |
|
|
|
|
|
|
|
|
|
# Configure the root logger once at import time. Every record carries a
# timestamp, level, logger name, and message. The level is INFO, so the
# logger.debug(...) calls in this file (e.g. prompt dumps) are not emitted
# unless this level is lowered.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s"
)

# Single named logger used by every function in this file.
logger = logging.getLogger("LLM-Legal-App")
|
|
|
|
|
|
|
|
|
def initialize_model():
    """Load the phi-2 causal language model and its tokenizer.

    Both are fetched with ``from_pretrained`` for ``microsoft/phi-2``. The
    model is requested in ``torch.float16`` with ``device_map="auto"``.

    Returns:
        tuple: ``(model, tokenizer)``.

    Raises:
        ValueError: If either load fails. The original exception is chained
            as ``__cause__`` so the underlying traceback is not lost.
    """
    logger.info("Initializing phi-2 model and tokenizer...")
    model_name = "microsoft/phi-2"

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            # NOTE(review): trust_remote_code allows code shipped with the
            # model repository to run locally; confirm this is still needed.
            trust_remote_code=True,
        )
    except Exception as e:
        logger.exception("Error initializing Hugging Face model.")
        # Callers only see ValueError; "from e" keeps the real cause attached.
        raise ValueError(f"Failed to initialize model: {e}") from e

    logger.info("Successfully initialized phi-2 model and tokenizer.")
    return model, tokenizer
|
|
|
|
|
# Load the model and tokenizer once, at import time. generate_with_model reads
# these two module-level names. initialize_model raises ValueError on failure,
# which aborts the import of this module.
model, tokenizer = initialize_model()
|
|
|
|
|
|
|
|
|
def generate_with_model(prompt, max_length=1400, temperature=0.3):
    """Generate a continuation of ``prompt`` with the module-level model.

    Args:
        prompt: Text to feed to the model.
        max_length: Maximum number of NEW tokens to generate (passed as
            ``max_new_tokens``).
        temperature: Sampling temperature. A value ``<= 0`` disables sampling
            (greedy decoding).

    Returns:
        str: The generated text with the prompt removed and surrounding
        whitespace stripped. On any failure this returns the string
        ``"Error generating text: <exception>"`` instead of raising; callers
        in this file detect failures by checking for an ``"Error"`` prefix.
    """
    logger.info("Generating text with phi-2 model.")

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        # Number of prompt tokens; the generated sequence starts with them.
        prompt_token_count = inputs["input_ids"].shape[1]

        do_sample = temperature > 0
        generation_config = {
            "max_new_tokens": max_length,
            "do_sample": do_sample,
            "pad_token_id": tokenizer.eos_token_id,
        }
        # Sampling-only knobs are passed only when sampling is enabled, so a
        # greedy run is not handed a meaningless (or zero) temperature.
        if do_sample:
            generation_config["temperature"] = temperature
            generation_config["top_p"] = 0.9

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # Drop the prompt at token level. Comparing decoded text against the
        # prompt string is fragile: decoding does not always reproduce the
        # prompt character-for-character, which would leave the whole prompt
        # in the response.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        logger.info("Text generation complete.")
        return response

    except Exception as e:
        logger.exception("Error during text generation.")
        return f"Error generating text: {e}"
|
|
|
def generate_legal_document(doc_type, party_a, party_b, context, country):
    """Build a drafting prompt and return the model's generated document text.

    Args:
        doc_type: Kind of document to draft (interpolated into the prompt).
        party_a: Name of the first party; a placeholder is used when empty.
        party_b: Name of the second party; a placeholder is used when empty.
        context: Short brief of the agreement; a placeholder is used when empty.
        country: Governing-law country; a placeholder is used when empty.

    Returns:
        str: Whatever ``generate_with_model`` returns for the prompt, which is
        either the generated text or an ``"Error generating text: ..."`` string.
    """
    logger.info(f"Starting generation for doc_type={doc_type!r}.")

    party_a = party_a if party_a else "[Party A Not Provided]"
    party_b = party_b if party_b else "[Party B Not Provided]"
    context = context if context else "[Context Not Provided]"
    # Same treatment as the other fields, so the prompt never reads
    # "Governing law: None" or "Governing law: " when no country is supplied.
    country = country if country else "[Country Not Provided]"

    prompt = f"""
You are a helpful legal assistant. Generate a {doc_type} for:
1) {party_a}
2) {party_b}

Context/brief of the agreement:
{context}.

The document should include:
- Purpose of the {doc_type}
- Responsibilities and obligations of each party
- Confidentiality terms
- Payment terms (use [To Be Determined] if not specified)
- Term (duration) and termination
- Governing law: {country}
- Jurisdiction: [Appropriate region in {country} if not provided]
- Signature blocks

Use formal language, but keep it relatively clear and readable.
For any missing information, use placeholders like [To Be Determined].
Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
"""
    logger.debug(f"Generated prompt:\n{prompt}")

    return generate_with_model(prompt, max_length=1400, temperature=0.3)
|
|
|
# Prefix of the string generate_with_model returns when generation fails.
_GENERATION_ERROR_PREFIX = "Error generating text:"

# Prompt for the checklist-style review; {doc_text} is filled with str.format.
_RULE_BASED_REVIEW_TEMPLATE = """
You are a legal AI assistant reviewing a document. Provide a review,
structured into the following numbered sections. Be concise and factual. Do NOT
use Markdown. Use plain text labels for each section.

Document text:
\"\"\"
{doc_text}
\"\"\"

Review Sections:

1) Parties and Authority:
- Confirm the full legal names of all parties.
- Make sure the people signing can legally commit their organizations.

2) Scope of Work / Obligations:
- Check that the contract clearly describes what each side must do.
- Look for deadlines, milestones, or deliverables.
- Ensure everything is realistic and not overly vague.

3) Definitions and Key Terms:
- See if there's a section that explains important terms.
- Ensure those terms are used the same way throughout the contract.
- Avoid or clarify any ambiguous language.

4) Payment Terms (If Applicable):
- Check how much is owed, the currency, and when it's due.
- Look for penalties, interest, or late fees.
- Note how and when invoices are sent or paid.

5) Term and Termination:
- Identify when the contract starts and ends.
- Understand how it can be renewed.
- See the conditions and notice required for ending the contract early.

6) Intellectual Property (IP) Rights:
- Confirm who owns any work created under the agreement.
- Note if licenses are granted for using the IP, and for how long.

7) Confidentiality and Privacy:
- Check what is considered confidential information.
- Look for exceptions (like already public info).
- See how long the confidentiality rules apply.

8) Warranties and Representations:
- Note any performance guarantees or quality promises.
- Look for disclaimers (like "as is" clauses).

9) Indemnification:
- See who will pay legal costs or damages if there's a lawsuit or claim.
- Check any limits on what's covered.

10) Limitation of Liability:
- Check if there's a maximum amount one side can claim in damages.
- Look for excluded damages, like lost profits.

11) Dispute Resolution and Governing Law:
- See if disputes go to arbitration, mediation, or court.
- Note which state or country's laws will apply.

12) Force Majeure (Unforeseen Events):
- Look for events like natural disasters or war that could suspend obligations.
- See if there are notice requirements for these events.

13) Notices and Amendments:
- Check how official notices must be sent (email, mail, etc.).
- Find out how to properly change the contract (in writing, signatures, etc.).

14) Entire Agreement and Severability:
- Confirm that this contract replaces all previous agreements.
- Ensure that if one clause is invalid, the rest still stands.

15) Signatures and Dates:
- Make sure the right people sign in their proper roles.
- Verify the date of signature and when the contract goes into effect.

16) Ambiguities, Contradictions, and Hidden Clauses:
- Watch for contradictory statements or clauses that conflict.
- Beware of vague phrases like "best efforts" without clear guidelines.
- Check for hidden or "buried" clauses in fine print or attachments.

17) Compliance and Regulatory Alignment:
- Ensure the contract follows relevant laws and rules.
- Check for industry-specific requirements.

18) Practical Considerations:
- Make sure deadlines and other requirements are doable.
- Confirm all negotiations are reflected in writing.
- Avoid blank or undefined items (like fees or dates "to be decided").
"""

# Prompt for the wording/style review; {doc_text} is filled with str.format.
_WORDING_ANALYSIS_TEMPLATE = """
You are a legal AI assistant. Analyze the following legal document for its wording:

Document text:
\"\"\"
{doc_text}
\"\"\"

Provide a comprehensive analysis of the document's wording, covering these aspects for the ENTIRE document text:

1. **Clarity and Precision:** Identify ambiguous or vague language, and suggest improvements.
2. **Readability:** Assess the overall readability and suggest improvements for clarity, including sentence structure and complexity.
3. **Formal Tone:** Check if the language maintains a formal and professional tone appropriate for a legal document, and suggest changes if needed.
4. **Consistency:** Ensure consistent use of terms and phrasing throughout the document. Point out any inconsistencies.
5. **Redundancy:** Identify any unnecessary repetition of words or phrases.
6. **Jargon and Technical Terms:** Identify jargon or technical terms that might be unclear to a non-expert, and suggest clearer alternatives where appropriate.
7. **Overall Recommendations:** Give overall recommendations for improving the document's wording.

Provide your analysis in plain text, without using Markdown. Label each section of your analysis clearly (e.g., "Clarity and Precision:", "Readability:", etc.).
"""


def _run_review_pass(prompt, max_length, label):
    """Run one model pass; return ``(text, None)`` or ``(None, error_message)``."""
    try:
        text = generate_with_model(prompt, max_length=max_length, temperature=0.3)
    except Exception as e:
        logger.exception(f"Error during {label}.")
        return None, f"Error during {label}: {e}"

    # generate_with_model reports failures by RETURNING an error string, not
    # by raising, so the failure has to be detected from the returned text.
    if text.startswith(_GENERATION_ERROR_PREFIX):
        logger.error(f"Error during {label}.")
        return None, f"Error during {label}: {text}"
    return text, None


def review_legal_document(doc_text, doc_type, party_a, party_b):
    """Review a document with two model passes and combine the results.

    The first pass asks for a review against an 18-section checklist; the
    second asks for an analysis of the document's wording.

    Args:
        doc_text: Full text of the document to review.
        doc_type: Accepted for interface compatibility; not used here.
        party_a: Accepted for interface compatibility; not used here.
        party_b: Accepted for interface compatibility; not used here.

    Returns:
        str: ``"Rule-Based Analysis:\\n\\n<review>\\n\\nWording Analysis:\\n\\n<analysis>"``
        on success. If either pass fails, a string starting with
        ``"Error during rule-based review:"`` or
        ``"Error during wording analysis:"`` is returned instead, so callers
        that check for an ``"Error"`` prefix see the failure.
    """
    logger.info("Starting document review (rule-based and wording).")

    rule_based_prompt = _RULE_BASED_REVIEW_TEMPLATE.format(doc_text=doc_text)
    logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")

    rule_based_review, error = _run_review_pass(
        rule_based_prompt, max_length=2000, label="rule-based review"
    )
    if error is not None:
        return error

    wording_analysis_prompt = _WORDING_ANALYSIS_TEMPLATE.format(doc_text=doc_text)
    logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")

    wording_analysis, error = _run_review_pass(
        wording_analysis_prompt, max_length=1000, label="wording analysis"
    )
    if error is not None:
        return error

    combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
    return combined_review
|
|
|
|
|
|
|
|
|
|
|
def parse_bytesio(file_data: BytesIO) -> str:
    """Extract text from an in-memory DOCX or PDF stream.

    The stream is first opened as DOCX. If it is not a DOCX package, it is
    rewound and parsed as a PDF.

    Args:
        file_data: A seekable binary stream holding a DOCX or PDF file.

    Returns:
        str: The extracted text (paragraphs or non-empty pages joined by
        newlines, stripped). On failure, a string starting with ``"Error"``
        describing the problem; this function does not raise.
    """
    # Local import: zipfile is needed only to recognise "not a zip" failures.
    import zipfile

    logger.info("Parsing BytesIO object...")
    try:
        try:
            doc_obj = docx.Document(file_data)
            return "\n".join(para.text for para in doc_obj.paragraphs).strip()
        except (docx.opc.exceptions.PackageNotFoundError, zipfile.BadZipFile):
            # For a stream (as opposed to a path) python-docx hands the data
            # straight to zipfile, so a non-DOCX stream surfaces as
            # BadZipFile rather than PackageNotFoundError. Both mean "not a
            # DOCX", so both must fall through to the PDF attempt.
            logger.info("BytesIO is not DOCX, trying PDF.")
            file_data.seek(0)
            try:
                pdf_reader = PyPDF2.PdfReader(file_data)
                # Extract each page once, then drop pages with no text.
                page_texts = (page.extract_text() for page in pdf_reader.pages)
                return "\n".join(text for text in page_texts if text).strip()
            except Exception as e:
                logger.exception(f"Error parsing BytesIO as PDF: {e}")
                return f"Error parsing BytesIO as PDF: {e}"
        except Exception as e:
            logger.exception(f"Error processing BytesIO: {e}")
            return f"Error processing file content: {e}"
    except Exception as e:
        logger.exception(f"Error parsing BytesIO: {e}")
        return f"Error parsing BytesIO: {e}"
|
|
|
def parse_uploaded_file_path(file_data) -> str:
    """Extract text from an uploaded file given as a path, dict, or stream.

    Args:
        file_data: One of: a filesystem path (``str``); a ``dict`` with a
            ``'name'`` key holding the path; or a ``BytesIO``/``IOBase``
            stream, which is delegated to ``parse_bytesio``.

    Returns:
        str: The extracted text. Special return values:
        ``""`` when ``file_data`` is falsy;
        ``"Unsupported file format."`` when the extension is neither
        ``.pdf`` nor ``.docx``;
        a string starting with ``"Error"`` when the input type is unexpected
        or parsing fails. This function does not raise.
    """
    if not file_data:
        logger.warning("No file provided.")
        return ""

    if isinstance(file_data, str):
        file_path = file_data
        logger.info(f"Received filepath: {file_path}")
    elif isinstance(file_data, dict) and 'name' in file_data:
        file_path = file_data['name']
        logger.info(f"Received file object with name: {file_path}")
    elif isinstance(file_data, (BytesIO, IOBase)):
        return parse_bytesio(file_data)
    else:
        logger.error(f"Unexpected file_data type: {type(file_data)}")
        return "Error: Unexpected file data format."

    logger.info(f"Attempting to parse file at {file_path}")
    try:
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        if ext == ".pdf":
            with open(file_path, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                # Extract each page once, then drop pages with no text.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
            return "\n".join(text for text in page_texts if text).strip()

        if ext == ".docx":
            doc_obj = docx.Document(file_path)
            return "\n".join(para.text for para in doc_obj.paragraphs).strip()

        return "Unsupported file format."
    except Exception as e:
        logger.exception(f"Error parsing file: {e}")
        return f"Error parsing file: {e}"
|
|
|
|
|
|
|
|
|
|
|
def clean_markdown(text):
    """Strip common Markdown formatting from ``text`` and return plain text.

    Handles ATX headings, horizontal rules, bullet and ``N.`` list markers,
    images (removed), links (replaced by their label), and bold/italic
    emphasis.

    Line-level markers are removed before inline emphasis so that a ``*``
    bullet or a ``***`` rule is not mistaken for an emphasis delimiter.
    Emphasis is only removed when the opening and closing delimiters match,
    and ``_`` inside a word (``snake_case``, ``[To_Be_Determined]``) is kept.

    Args:
        text: Markdown text, or a falsy value.

    Returns:
        str: The cleaned text with surrounding whitespace stripped, or ``""``
        when ``text`` is falsy.
    """
    if not text:
        return ""

    # --- Line-level constructs (must run before emphasis removal) ---
    # Headings: "## Title" -> "Title"
    text = re.sub(r'^#+[ \t]+', '', text, flags=re.MULTILINE)
    # Horizontal rules: a line made only of 3+ '-', '_' or '*'
    text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)
    # Bullet markers: "- item", "+ item", "* item"
    text = re.sub(r'^[\-\+\*][ \t]+', '', text, flags=re.MULTILINE)
    # Numbered list markers: "1. item"
    text = re.sub(r'^\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # --- Inline constructs ---
    # Images are dropped entirely; links keep only their label.
    text = re.sub(r'!\[(.*?)\]\((.*?)\)', '', text)
    text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text)
    # Bold: the closing delimiter must equal the opening one (\1), and the
    # content must be non-empty and not padded with whitespace.
    text = re.sub(r'(\*\*|__)(?=\S)(.+?)(?<=\S)\1', r'\2', text)
    # Italic with asterisks.
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)
    # Italic with underscores, only at word boundaries so identifiers and
    # placeholders containing '_' are left intact.
    text = re.sub(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)', r'\1', text)

    return text.strip()
|
|
|
# Labels that review_legal_document places in front of each review part.
_REVIEW_SECTION_TITLES = ("Rule-Based Analysis", "Wording Analysis")


def _add_review_heading(document, title):
    """Add a 14pt red paragraph used as a sub-heading inside the review."""
    run = document.add_paragraph().add_run(title)
    run.font.size = Pt(14)
    run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)


def _add_review_sections(document, review_text):
    """Write the review into ``document``, tracking which part each chunk belongs to."""
    current_title = None
    for section in review_text.split("\n\n"):
        matched_title = next(
            (t for t in _REVIEW_SECTION_TITLES if section.startswith(f"{t}:")),
            None,
        )
        if matched_title is not None:
            # The label is followed by a blank line, so after splitting on
            # "\n\n" it arrives as its own chunk. Remember it so that the
            # chunks that follow are formatted as part of this section.
            current_title = matched_title
            _add_review_heading(document, matched_title)
            section = section[len(matched_title) + 1:]
            if not section.strip():
                continue

        if current_title is None:
            # Text before any recognised label: keep it as one paragraph.
            document.add_paragraph(section)
            continue

        for para in section.split("\n"):
            if current_title == "Rule-Based Analysis" and re.match(r"^\d+\)", para):
                # Numbered checklist headings ("3) Definitions ...") in red.
                # The text already carries its own number, so no automatic
                # list-numbering style is applied on top of it.
                numbered = document.add_paragraph()
                numbered.add_run(para).font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
            else:
                document.add_paragraph(para)


def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
    """Build a DOCX report and save it to a temporary file.

    Args:
        doc_text: Generated document text; added under "Generated Document"
            after Markdown is stripped. Skipped when falsy.
        review_text: Combined review text; added under "LLM Review".
            Skipped when falsy.
        doc_type: Document type, used in the title and the file name.
        party_a: First party name, used in the title.
        party_b: Second party name, used in the title.

    Returns:
        str: Path of the saved ``.docx`` file. The file is not deleted by
        this function.
    """
    logger.debug("Creating and saving DOCX.")
    document = docx.Document()

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # doc_type ends up in a file name; replace anything that could act as a
    # path separator or otherwise be invalid there.
    safe_doc_type = re.sub(r"[^A-Za-z0-9_.-]+", "_", str(doc_type))
    file_name = f"HF_AI_Review_{safe_doc_type}_{timestamp}.docx"

    title = f"DocumentCogito Analysis of {doc_type} between companies {party_a} and {party_b}"
    document.add_heading(title, level=1)

    if doc_text:
        document.add_heading("Generated Document", level=2)
        for para in clean_markdown(doc_text).split("\n"):
            document.add_paragraph(para)

    if review_text:
        document.add_heading("LLM Review", level=2)
        _add_review_sections(document, review_text)

    # Create the temp file and close our handle BEFORE saving: writing by
    # name to a file that is still held open fails on some platforms.
    fd, output_path = tempfile.mkstemp(suffix=f"_{file_name}")
    os.close(fd)
    document.save(output_path)
    logger.debug(f"DOCX saved to: {output_path}")
    return output_path
|
|
|
|
|
|
|
|
|
|
|
def generate_document_interface(doc_type, party_a, party_b, context, country):
    """Gradio callback: generate a document and package it as a DOCX.

    Returns:
        tuple: ``(text, path)``. ``text`` is the generated document, or the
        error message when generation failed. ``path`` is the saved DOCX
        path, or ``None`` when ``text`` starts with ``"Error"``.
    """
    logger.info(f"User requested doc generation: {doc_type}, {country}")

    generated = generate_legal_document(doc_type, party_a, party_b, context, country)

    # Failures come back as text beginning with "Error"; no file in that case.
    saved_path = None
    if not generated.startswith("Error"):
        saved_path = create_and_save_docx(
            generated, doc_type=doc_type, party_a=party_a, party_b=party_b
        )
    return generated, saved_path
|
|
|
def review_document_interface(file_data, doc_type, party_a, party_b):
    """Gradio callback: parse an uploaded file, review it, and build a DOCX.

    Args:
        file_data: The uploaded file as passed by the UI.
        doc_type: Document type, forwarded to the review and the report.
        party_a: First party name, forwarded to the review and the report.
        party_b: Second party name, forwarded to the review and the report.

    Returns:
        tuple: ``(text, path)``. ``text`` is the review, or a message
        explaining why no review was produced. ``path`` is the saved DOCX
        path, or ``None`` whenever no review was produced.
    """
    logger.info("User requested review.")
    if not file_data:
        return "No file uploaded.", None

    original_text = parse_uploaded_file_path(file_data)
    if original_text.startswith(("Error", "Unsupported")):
        return original_text, None

    # The parser returns "" when nothing could be extracted (for example a
    # PDF whose pages yield no text). Reviewing an empty document would only
    # produce a made-up review, so stop here instead.
    if not original_text.strip():
        logger.warning("No text extracted from uploaded file.")
        return "Error: No text could be extracted from the uploaded file.", None

    review_text = review_legal_document(original_text, doc_type, party_a, party_b)
    if review_text.startswith("Error"):
        return review_text, None

    docx_file_path = create_and_save_docx(None, review_text, doc_type, party_a, party_b)
    return review_text, docx_file_path
|
|
|
|
|
|
|
|
|
|
|
# CSS handed to gr.Blocks(css=...) in build_app. It defines background and
# text colours for two classes; build_app attaches "tab-one" to the
# "Review Document" tab. No element in this file uses "tab-two".
custom_css = """
.tab-one {
background-color: #D1EEFC; /* Light blue */
color: #333;
}
.tab-two {
background-color: #FCEED1; /* Light orange */
color: #333;
}
/* If you want to style the tab label differently, you may need to target
specific child elements (like a .tab__header) within the class. */
"""
|
|
|
def build_app():
    """Build the Gradio Blocks UI and return it without launching.

    The UI has two tabs. "Generate Document" is created with
    ``visible=False``; "Review Document" (id=1) is the tab selected by
    ``gr.Tabs(selected=1)``.

    Returns:
        The ``gr.Blocks`` instance.
    """
    # NOTE(review): the nesting of the `with` blocks below is reconstructed;
    # confirm the placement of the trailing gen_button.click(...) and the
    # final gr.Markdown note against the intended layout.
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown(
            """
# UST Global Legal Document Analyzer (Hugging Face Version)

**Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.

**Disclaimer**: This tool provides assistance but is not a substitute for professional legal advice.
"""
        )
        with gr.Tabs(selected=1):
            # Generation tab: present in the layout but hidden.
            with gr.Tab("Generate Document", visible=False):
                doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
                party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
                party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
                context = gr.Textbox(label="Context/Brief", placeholder="Short summary of the agreement...")
                country = gr.Dropdown(label="Governing Law (Country)", choices=["India", "Malaysia", "US", "UK", "Singapore", "Japan"], value="India")
                gen_button = gr.Button("Generate Document")
                gen_output_text = gr.Textbox(label="Generated Document", lines=15, placeholder="Generated document will appear here...")
                gen_output_file = gr.File(label="Download DOCX", type="filepath")
                # Generate the document text and its DOCX download.
                gen_button.click(
                    generate_document_interface,
                    inputs=[doc_type, party_a, party_b, context, country],
                    outputs=[gen_output_text, gen_output_file]
                )

            with gr.Tab("Review Document", elem_classes="tab-one", id=1):

                # Hidden fields: they are passed to the review callback but
                # are not shown to the user.
                doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
                party_a_review = gr.Textbox(label="Party A Name", visible=False)
                party_b_review = gr.Textbox(label="Party B Name", visible=False)

                file_input = gr.File(label="Upload PDF/DOCX for Review", type="filepath")
                review_button = gr.Button("Review Document")
                review_output_text = gr.Textbox(label="Review", lines=15, placeholder="Review will appear here...")
                review_output_file = gr.File(label="Download Reviewed DOCX", type="filepath")
                # Parse, review, and produce the review DOCX download.
                review_button.click(
                    review_document_interface,
                    inputs=[file_input, doc_type_review, party_a_review, party_b_review],
                    outputs=[review_output_text, review_output_file]
                )

                # Second handler on the generate button: copy the generation
                # tab's type/party values into the hidden review fields.
                gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])

        gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
    return demo
|
|
|
|
|
# Script entry point: build the UI and start the Gradio server. The app is
# launched with debug=True and without a public share link (share=False).
if __name__ == "__main__":

    logger.info("Initializing Gradio interface...")
    demo = build_app()
    logger.info("Launching Gradio app.")
    demo.launch(debug=True,share=False)