File size: 2,687 Bytes
5c51375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""Diff visualization utilities."""

import difflib
import re
from typing import Iterator


def generate_html_diff(original: str, cleaned: str) -> str:
    """Render a side-by-side HTML comparison of two texts.

    Both texts are split into lines (line endings kept) and handed to
    ``difflib.HtmlDiff``, configured with a tab size of 2 and wrapping at
    column 80. Only changed regions are rendered, each surrounded by three
    lines of context.

    Args:
        original: Text before cleaning; shown under "Original Text".
        cleaned: Text after cleaning; shown under "Cleaned Text".

    Returns:
        A complete HTML document, post-processed by ``_style_diff_html``.
    """
    before, after = (
        text.splitlines(keepends=True) for text in (original, cleaned)
    )

    page = difflib.HtmlDiff(tabsize=2, wrapcolumn=80).make_file(
        before,
        after,
        fromdesc="Original Text",
        todesc="Cleaned Text",
        context=True,
        numlines=3,
    )

    return _style_diff_html(page)


def _style_diff_html(html_diff: str) -> str:
    """Inject custom CSS into a diff HTML page and wrap its diff tables.

    Args:
        html_diff: HTML document text, e.g. the output of
            ``difflib.HtmlDiff.make_file``.

    Returns:
        The document with the custom ``<style>`` block inserted immediately
        before ``</head>`` and with every ``<table class="diff" ...>``
        element enclosed in its own ``<div class="diff">`` container.
        Tables without that class (such as tables nested inside a diff
        table) are left unwrapped, so the added ``<div>`` / ``</div>`` tags
        are always balanced.
    """
    custom_style = """
    <style>
        .diff {
            font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
            font-size: 12px;
            line-height: 1.4;
            border: 1px solid #ddd;
            border-radius: 8px;
            overflow: auto;
            max-height: 600px;
        }
        .diff table {
            width: 100%;
            border-collapse: collapse;
            margin: 0;
        }
        .diff td {
            padding: 2px 8px;
            vertical-align: top;
            white-space: pre-wrap;
            word-wrap: break-word;
        }
        .diff_header {
            background: #f8f9fa;
            font-weight: bold;
            text-align: center;
            border-bottom: 1px solid #dee2e6;
        }
        .diff_next {
            background: #e9ecef;
            text-align: center;
            font-size: 10px;
        }
        .diff_add {
            background: #d4edda;
            color: #155724;
        }
        .diff_chg {
            background: #fff3cd;
            color: #856404;
        }
        .diff_sub {
            background: #f8d7da;
            color: #721c24;
        }
    </style>
    """

    # Insert custom style before closing </head> tag. Placing it after the
    # page's own styles lets these rules win on equal specificity.
    styled_html = html_diff.replace('</head>', f'{custom_style}</head>')

    # One entry per currently open <table>: True when a wrapper <div> was
    # emitted for it. A blanket replace of '</table>' would also append
    # '</div>' after nested, unwrapped tables and leave the markup
    # unbalanced, so each closing tag is paired with its own opening tag.
    wrapper_opened: list[bool] = []

    def _wrap_table_tag(match: re.Match) -> str:
        tag = match.group(0)
        if tag == '</table>':
            # A stray closing tag with nothing open is passed through as-is.
            had_wrapper = wrapper_opened.pop() if wrapper_opened else False
            return '</table></div>' if had_wrapper else tag
        is_diff_table = tag == '<table class="diff"'
        wrapper_opened.append(is_diff_table)
        return f'<div class="diff">{tag}' if is_diff_table else tag

    # The class-specific alternative is listed first so it takes precedence
    # over the generic '<table' opener at the same position.
    return re.sub(
        r'<table class="diff"|<table\b|</table>',
        _wrap_table_tag,
        styled_html,
    )


def get_diff_stats(original: str, cleaned: str) -> dict[str, int]:
    """Get statistics about the diff between original and cleaned text.

    Args:
        original: Text before cleaning.
        cleaned: Text after cleaning.

    Returns:
        A dict with the keys:

        * ``original_length``: number of characters in ``original``.
        * ``cleaned_length``: number of characters in ``cleaned``.
        * ``similarity_ratio``: character-level similarity as a percentage,
          a float rounded to two decimals (100.0 for identical texts,
          including two empty strings).
        * ``characters_removed``: ``len(original) - len(cleaned)``; negative
          when the cleaned text is longer than the original.
    """
    # autojunk must be off for character-level comparison: with the default
    # heuristic, once the second sequence has 200+ items every item that
    # occurs in more than ~1% of positions is dropped from the match index.
    # In ordinary text that is nearly every character, which can push the
    # ratio of almost-identical texts towards 0. Disabling it trades some
    # speed on very long inputs for a correct ratio.
    matcher = difflib.SequenceMatcher(None, original, cleaned, autojunk=False)

    return {
        'original_length': len(original),
        'cleaned_length': len(cleaned),
        'similarity_ratio': round(matcher.ratio() * 100, 2),
        'characters_removed': len(original) - len(cleaned)
    }