Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	create streamlit app
Browse files- app.py +286 -0
- requirements.txt +132 -0
- scrape-content.ipynb +3 -1
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,286 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import base64
         | 
| 2 | 
            +
            import copy
         | 
| 3 | 
            +
            import json
         | 
| 4 | 
            +
            from collections import Counter
         | 
| 5 | 
            +
            from urllib.parse import urljoin
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            import streamlit as st
         | 
| 8 | 
            +
            from bs4 import BeautifulSoup
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
def remove_svg_elements(element):
    """Return a copy of *element* with every <svg> tag removed.

    The input is never mutated; all pruning happens on a copy.
    """
    # bs4 Tag defines __copy__, so copy.copy duplicates the whole subtree.
    clone = copy.copy(element)

    # NavigableString-like nodes have no find_all; leave them untouched.
    if hasattr(clone, 'find_all'):
        for svg_tag in clone.find_all('svg'):
            svg_tag.decompose()

    return clone
def get_element_signature(element):
    """Build a structural fingerprint string for *element*.

    Two elements that share the same tag name, CSS classes, set of direct
    child tags, and the same image/price/link presence yield identical
    signatures, which lets callers group "similar looking" nodes.
    """
    text = element.get_text()
    direct_child_tags = [
        child.name for child in element.find_all(recursive=False) if child.name
    ]
    fingerprint = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(sorted(direct_child_tags)),
        'has_image': bool(element.find('img')),
        'has_price': any(symbol in text for symbol in '$€£¥'),
        'has_link': bool(element.find('a')),
    }
    return str(fingerprint)
def analyze_children_similarity(element):
    """Score how uniform an element's direct children are.

    Returns ``(similarity_score, count)`` where ``similarity_score`` is the
    fraction of direct child tags sharing the single most common structural
    signature and ``count`` is how many children carry that signature.
    Returns ``(0, 0)`` when there is nothing to compare.
    """
    if not element.contents:
        return 0, 0

    signatures = [
        get_element_signature(node)
        for node in element.find_all(recursive=False)
        if node.name  # ignore bare text nodes
    ]
    if not signatures:
        return 0, 0

    _, top_count = Counter(signatures).most_common(1)[0]
    return top_count / len(signatures), top_count
def count_images_in_element(element):
    """Return the total number of <img> tags anywhere inside *element*,
    including deeply nested ones."""
    return sum(1 for _ in element.find_all('img', recursive=True))
def get_element_identifier(element):
    """Describe an element as ``tag .class1 .class2 #id`` for display."""
    parts = [element.name]
    css_classes = element.get('class')
    if css_classes:
        parts.append("." + " .".join(css_classes))
    node_id = element.get('id')
    if node_id:
        parts.append(f"#{node_id}")
    return " ".join(parts)
def convert_relative_urls(soup, base_url):
    """Rewrite every href/src/data-src attribute in *soup* to an absolute
    URL resolved against *base_url*. Mutates and returns *soup*."""
    # href first, then src — same order as attribute-specific lookups below.
    for attr in ('href', 'src'):
        for node in soup.find_all(**{attr: True}):
            node[attr] = urljoin(base_url, node[attr])
    # data-* attributes cannot be passed as keywords; use attrs= lookup.
    for node in soup.find_all(attrs={'data-src': True}):
        node['data-src'] = urljoin(base_url, node['data-src'])
    return soup
def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """Locate container elements that hold many similar, image-bearing children.

    Parses *html_content*, optionally rewrites relative URLs against
    *base_url*, then scores every <div>/<ul>/<section>/<main> candidate by
    (child similarity x similar-child count x image count).

    Returns a tuple of:
      * a list of ``(identifier, image_count, products_count)`` sorted by score,
      * a dict describing the top container and its per-product HTML/images,
      * the top container's HTML string with all SVGs stripped.
    When nothing qualifies: ``([], {"error": ...}, "")``.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Absolutize links/images only when a base URL was supplied.
    if base_url:
        soup = convert_relative_urls(soup, base_url)

    # Collect candidate containers with their scores.
    candidates = []
    for node in soup.find_all():
        if node.name not in ('div', 'ul', 'section', 'main'):
            continue
        similarity, similar_count = analyze_children_similarity(node)
        images = count_images_in_element(node)

        if similar_count >= min_children and similarity >= min_similarity and images > 0:
            # "Products" are direct children that contain at least one image.
            product_children = [child for child in node.find_all(recursive=False)
                                if child.name and child.find('img', recursive=True)]
            score = similarity * similar_count * images
            candidates.append((node, images, score, len(product_children)))

    if not candidates:
        return [], {"error": "No elements with images found"}, ""

    # Best candidates first.
    candidates.sort(key=lambda entry: entry[2], reverse=True)

    summary = [(get_element_identifier(node), images, product_count)
               for node, images, _, product_count in candidates]

    # Work on an SVG-free copy of the winner for all HTML output.
    best = remove_svg_elements(candidates[0][0])

    products = []
    for child in best.find_all(recursive=False):
        if not child.name:  # skip bare text nodes
            continue
        clean_child = remove_svg_elements(child)
        products.append({
            "html_content": str(clean_child),
            "images": [
                {"src": img.get('src', 'No source'),
                 "alt": img.get('alt', 'No alt text')}
                for img in clean_child.find_all('img', recursive=True)
            ],
        })

    top_element_info = {
        "parent": {
            "tag": best.name,
            "identifier": get_element_identifier(best),
            "classes": best.get('class', []),
            "id": best.get('id', None),
        },
        "products_count": len(products),
        "products": products,
    }

    return summary, top_element_info, str(best)
def get_download_link(content, filename, content_type="application/json"):
    """Return an HTML anchor that downloads *content* as *filename*.

    The payload is embedded as a base64 data URI so no server round-trip is
    needed. ``content_type`` defaults to JSON; pass e.g. ``text/html`` for
    HTML payloads.

    Fixes: the link previously hard-coded the literal text "(unknown)" in
    both the ``download`` attribute and the label instead of interpolating
    *filename*, and used the invalid MIME type "file/json" as the default.
    """
    b64 = base64.b64encode(content.encode()).decode()
    return (
        f'<a href="data:{content_type};base64,{b64}" '
        f'download="{filename}">Download {filename}</a>'
    )
def main():
    """Streamlit entry point: upload HTML files, find their image-rich
    container elements, and offer the results as JSON/HTML downloads.

    Fixes: the per-file section header previously rendered the literal text
    "(unknown)" instead of the file name; the results dicts were also
    initialized twice (once outside the button branch where they were never
    read), which has been removed.
    """
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")

    # File uploader allows multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])

    if uploaded_files:
        # Analysis parameters
        col1, col2 = st.columns(2)
        with col1:
            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
        with col2:
            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)

        # Generate button
        if st.button("Generate Analysis"):
            # Show processing message
            with st.spinner('Processing files...'):
                all_results = {}
                all_html_outputs = {}

                # Process each file independently; one bad file must not
                # abort the whole batch.
                for uploaded_file in uploaded_files:
                    st.subheader(f"Analysis for {uploaded_file.name}")

                    try:
                        # Read and process the file
                        html_content = uploaded_file.read().decode('utf-8')
                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
                            html_content,
                            min_children=min_children,
                            min_similarity=min_similarity
                        )

                        # Display results
                        st.write("Elements containing images:")
                        for element, img_count, prod_count in sorted_elements:
                            st.write(f"- {element}: {img_count} images, {prod_count} products")

                        # Store results
                        all_results[uploaded_file.name] = top_element_info
                        all_html_outputs[uploaded_file.name] = html_output

                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                        continue

                # Create download buttons if we have results
                if all_results:
                    st.subheader("Download Results")
                    col1, col2 = st.columns(2)

                    # JSON download
                    with col1:
                        json_str = json.dumps(all_results, indent=2)
                        st.markdown(get_download_link(json_str, 'analysis_results.json'),
                                    unsafe_allow_html=True)

                    # HTML download
                    with col2:
                        # Combine all HTML outputs with file names as headers
                        combined_html = """
                            <!DOCTYPE html>
                            <html>
                            <head>
                                <meta charset='UTF-8'>
                                <style>
                                    div {
                                        width: auto !important;
                                        height: auto !important;
                                        padding: 0 !important;
                                        margin: 0 !important;
                                    }
                                    img {
                                        width: 300px;
                                        height: 300px;
                                        object-fit: contain;
                                    }
                                    body { font-family: Arial, sans-serif; }
                                    .file-section { margin: 20px 0; }
                                    .file-header {
                                        background: #f0f0f0;
                                        padding: 10px;
                                        margin: 20px 0;
                                    }
                                </style>
                            </head>
                            <body>
                            """
                        for filename, html in all_html_outputs.items():
                            combined_html += f"""
                                <div class="file-section">
                                    <h2 class="file-header">{filename}</h2>
                                    {html}
                                </div>
                                """
                        combined_html += "</body></html>"

                        st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                                    unsafe_allow_html=True)

                    # Success message
                    st.success("Analysis completed successfully!")


if __name__ == "__main__":
    main()
    	
        requirements.txt
    ADDED
    
    | @@ -0,0 +1,132 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            altair==5.4.1
         | 
| 2 | 
            +
            anyio==4.6.2.post1
         | 
| 3 | 
            +
            argon2-cffi==23.1.0
         | 
| 4 | 
            +
            argon2-cffi-bindings==21.2.0
         | 
| 5 | 
            +
            arrow==1.3.0
         | 
| 6 | 
            +
            asttokens==2.4.1
         | 
| 7 | 
            +
            async-lru==2.0.4
         | 
| 8 | 
            +
            attrs==24.2.0
         | 
| 9 | 
            +
            babel==2.16.0
         | 
| 10 | 
            +
            beautifulsoup4==4.12.3
         | 
| 11 | 
            +
            black==24.10.0
         | 
| 12 | 
            +
            bleach==6.1.0
         | 
| 13 | 
            +
            blinker==1.8.2
         | 
| 14 | 
            +
            bs4==0.0.2
         | 
| 15 | 
            +
            cachetools==5.5.0
         | 
| 16 | 
            +
            certifi==2024.8.30
         | 
| 17 | 
            +
            cffi==1.17.1
         | 
| 18 | 
            +
            charset-normalizer==3.4.0
         | 
| 19 | 
            +
            click==8.1.7
         | 
| 20 | 
            +
            comm==0.2.2
         | 
| 21 | 
            +
            debugpy==1.8.7
         | 
| 22 | 
            +
            decorator==5.1.1
         | 
| 23 | 
            +
            defusedxml==0.7.1
         | 
| 24 | 
            +
            executing==2.1.0
         | 
| 25 | 
            +
            fake-headers==1.0.2
         | 
| 26 | 
            +
            fastjsonschema==2.20.0
         | 
| 27 | 
            +
            fqdn==1.5.1
         | 
| 28 | 
            +
            gitdb==4.0.11
         | 
| 29 | 
            +
            GitPython==3.1.43
         | 
| 30 | 
            +
            h11==0.14.0
         | 
| 31 | 
            +
            html5lib==1.1
         | 
| 32 | 
            +
            httpcore==1.0.6
         | 
| 33 | 
            +
            httpx==0.27.2
         | 
| 34 | 
            +
            idna==3.10
         | 
| 35 | 
            +
            ipykernel==6.29.5
         | 
| 36 | 
            +
            ipython==8.28.0
         | 
| 37 | 
            +
            ipywidgets==8.1.5
         | 
| 38 | 
            +
            isoduration==20.11.0
         | 
| 39 | 
            +
            jedi==0.19.1
         | 
| 40 | 
            +
            Jinja2==3.1.4
         | 
| 41 | 
            +
            json5==0.9.25
         | 
| 42 | 
            +
            jsonpointer==3.0.0
         | 
| 43 | 
            +
            jsonschema==4.23.0
         | 
| 44 | 
            +
            jsonschema-specifications==2024.10.1
         | 
| 45 | 
            +
            jupyter==1.1.1
         | 
| 46 | 
            +
            jupyter-console==6.6.3
         | 
| 47 | 
            +
            jupyter-events==0.10.0
         | 
| 48 | 
            +
            jupyter-lsp==2.2.5
         | 
| 49 | 
            +
            jupyter_client==8.6.3
         | 
| 50 | 
            +
            jupyter_core==5.7.2
         | 
| 51 | 
            +
            jupyter_server==2.14.2
         | 
| 52 | 
            +
            jupyter_server_terminals==0.5.3
         | 
| 53 | 
            +
            jupyterlab==4.2.5
         | 
| 54 | 
            +
            jupyterlab_pygments==0.3.0
         | 
| 55 | 
            +
            jupyterlab_server==2.27.3
         | 
| 56 | 
            +
            jupyterlab_widgets==3.0.13
         | 
| 57 | 
            +
            markdown-it-py==3.0.0
         | 
| 58 | 
            +
            MarkupSafe==3.0.2
         | 
| 59 | 
            +
            matplotlib-inline==0.1.7
         | 
| 60 | 
            +
            mdurl==0.1.2
         | 
| 61 | 
            +
            mistune==3.0.2
         | 
| 62 | 
            +
            mypy-extensions==1.0.0
         | 
| 63 | 
            +
            narwhals==1.11.0
         | 
| 64 | 
            +
            nbclient==0.10.0
         | 
| 65 | 
            +
            nbconvert==7.16.4
         | 
| 66 | 
            +
            nbformat==5.10.4
         | 
| 67 | 
            +
            nest-asyncio==1.6.0
         | 
| 68 | 
            +
            notebook==7.2.2
         | 
| 69 | 
            +
            notebook_shim==0.2.4
         | 
| 70 | 
            +
            numpy==2.1.2
         | 
| 71 | 
            +
            outcome==1.3.0.post0
         | 
| 72 | 
            +
            overrides==7.7.0
         | 
| 73 | 
            +
            packaging==24.1
         | 
| 74 | 
            +
            pandas==2.2.3
         | 
| 75 | 
            +
            pandocfilters==1.5.1
         | 
| 76 | 
            +
            parso==0.8.4
         | 
| 77 | 
            +
            pathspec==0.12.1
         | 
| 78 | 
            +
            pexpect==4.9.0
         | 
| 79 | 
            +
            pillow==10.4.0
         | 
| 80 | 
            +
            platformdirs==4.3.6
         | 
| 81 | 
            +
            prometheus_client==0.21.0
         | 
| 82 | 
            +
            prompt_toolkit==3.0.48
         | 
| 83 | 
            +
            protobuf==5.28.3
         | 
| 84 | 
            +
            psutil==6.1.0
         | 
| 85 | 
            +
            ptyprocess==0.7.0
         | 
| 86 | 
            +
            pure_eval==0.2.3
         | 
| 87 | 
            +
            pyarrow==17.0.0
         | 
| 88 | 
            +
            pycparser==2.22
         | 
| 89 | 
            +
            pydeck==0.9.1
         | 
| 90 | 
            +
            Pygments==2.18.0
         | 
| 91 | 
            +
            PySocks==1.7.1
         | 
| 92 | 
            +
            python-dateutil==2.9.0.post0
         | 
| 93 | 
            +
            python-json-logger==2.0.7
         | 
| 94 | 
            +
            pytz==2024.2
         | 
| 95 | 
            +
            PyYAML==6.0.2
         | 
| 96 | 
            +
            pyzmq==26.2.0
         | 
| 97 | 
            +
            referencing==0.35.1
         | 
| 98 | 
            +
            requests==2.32.3
         | 
| 99 | 
            +
            rfc3339-validator==0.1.4
         | 
| 100 | 
            +
            rfc3986-validator==0.1.1
         | 
| 101 | 
            +
            rich==13.9.3
         | 
| 102 | 
            +
            rpds-py==0.20.0
         | 
| 103 | 
            +
            selenium==4.25.0
         | 
| 104 | 
            +
            Send2Trash==1.8.3
         | 
| 105 | 
            +
            setuptools==75.2.0
         | 
| 106 | 
            +
            six==1.16.0
         | 
| 107 | 
            +
            smmap==5.0.1
         | 
| 108 | 
            +
            sniffio==1.3.1
         | 
| 109 | 
            +
            sortedcontainers==2.4.0
         | 
| 110 | 
            +
            soupsieve==2.6
         | 
| 111 | 
            +
            stack-data==0.6.3
         | 
| 112 | 
            +
            streamlit==1.39.0
         | 
| 113 | 
            +
            tenacity==9.0.0
         | 
| 114 | 
            +
            terminado==0.18.1
         | 
| 115 | 
            +
            tinycss2==1.4.0
         | 
| 116 | 
            +
            toml==0.10.2
         | 
| 117 | 
            +
            tornado==6.4.1
         | 
| 118 | 
            +
            traitlets==5.14.3
         | 
| 119 | 
            +
            trio==0.27.0
         | 
| 120 | 
            +
            trio-websocket==0.11.1
         | 
| 121 | 
            +
            types-python-dateutil==2.9.0.20241003
         | 
| 122 | 
            +
            typing_extensions==4.12.2
         | 
| 123 | 
            +
            tzdata==2024.2
         | 
| 124 | 
            +
            uri-template==1.3.0
         | 
| 125 | 
            +
            urllib3==2.2.3
         | 
| 126 | 
            +
            watchdog==5.0.3
         | 
| 127 | 
            +
            wcwidth==0.2.13
         | 
| 128 | 
            +
            webcolors==24.8.0
         | 
| 129 | 
            +
            webencodings==0.5.1
         | 
| 130 | 
            +
            websocket-client==1.8.0
         | 
| 131 | 
            +
            widgetsnbextension==4.0.13
         | 
| 132 | 
            +
            wsproto==1.2.0
         | 
    	
        scrape-content.ipynb
    CHANGED
    
    | @@ -207,6 +207,7 @@ | |
| 207 | 
             
                "    if not element.contents:\n",
         | 
| 208 | 
             
                "        return 0, 0\n",
         | 
| 209 | 
             
                "\n",
         | 
|  | |
| 210 | 
             
                "    child_signatures = [\n",
         | 
| 211 | 
             
                "        get_element_signature(child)\n",
         | 
| 212 | 
             
                "        for child in element.find_all(recursive=False)\n",
         | 
| @@ -216,9 +217,10 @@ | |
| 216 | 
             
                "    if not child_signatures:\n",
         | 
| 217 | 
             
                "        return 0, 0\n",
         | 
| 218 | 
             
                "\n",
         | 
|  | |
| 219 | 
             
                "    signature_counts = Counter(child_signatures)\n",
         | 
| 220 | 
             
                "    most_common_sig, most_common_count = signature_counts.most_common(1)[0]\n",
         | 
| 221 | 
            -
                "    similarity_score = most_common_count / len(child_signatures) | 
| 222 | 
             
                "\n",
         | 
| 223 | 
             
                "    return similarity_score, most_common_count\n",
         | 
| 224 | 
             
                "\n",
         | 
|  | |
| 207 | 
             
                "    if not element.contents:\n",
         | 
| 208 | 
             
                "        return 0, 0\n",
         | 
| 209 | 
             
                "\n",
         | 
| 210 | 
            +
                "    # Get signatures for all direct children that are elements (have a tag name)\n",
         | 
| 211 | 
             
                "    child_signatures = [\n",
         | 
| 212 | 
             
                "        get_element_signature(child)\n",
         | 
| 213 | 
             
                "        for child in element.find_all(recursive=False)\n",
         | 
|  | |
| 217 | 
             
                "    if not child_signatures:\n",
         | 
| 218 | 
             
                "        return 0, 0\n",
         | 
| 219 | 
             
                "\n",
         | 
| 220 | 
            +
                "    # Count how many times each signature appears and get the most common one\n",
         | 
| 221 | 
             
                "    signature_counts = Counter(child_signatures)\n",
         | 
| 222 | 
             
                "    most_common_sig, most_common_count = signature_counts.most_common(1)[0]\n",
         | 
| 223 | 
            +
                "    similarity_score = most_common_count / len(child_signatures)\n",
         | 
| 224 | 
             
                "\n",
         | 
| 225 | 
             
                "    return similarity_score, most_common_count\n",
         | 
| 226 | 
             
                "\n",
         | 
