ReadmeandLicenseGithub / github_analyzer.py
PraneshJs's picture
added strip() to prevent new line error
eb394c6 verified
raw
history blame
1.82 kB
from github import Github
from urllib.parse import urlparse
import os
from dotenv import load_dotenv
load_dotenv()
def extract_repo_name(url):
    """Extract the ``(owner, repository)`` pair from a GitHub repository URL.

    The owner and repository are taken from the first two path segments, so
    URLs copied from a browser that point deeper into the repository
    (``.../owner/repo/tree/main/src``) still resolve to the repository itself.
    A trailing ``.git`` (clone URL form) is removed from the repository name,
    and surrounding whitespace, duplicate or trailing slashes, query strings
    and fragments are ignored.

    Args:
        url: Repository URL, e.g. ``https://github.com/owner/repo``. The
            scheme-less forms ``github.com/owner/repo`` and ``owner/repo``
            are accepted as well.

    Returns:
        A ``(owner, name)`` tuple of strings.

    Raises:
        ValueError: If the URL does not contain both an owner and a
            repository name.
    """
    parsed = urlparse(url.strip())
    segments = [segment for segment in parsed.path.split("/") if segment]

    # Without a scheme, urlparse puts the host into the path
    # ("github.com/owner/repo"). GitHub owner names cannot contain dots, so a
    # dotted first segment followed by two more segments is the host.
    if not parsed.netloc and len(segments) > 2 and "." in segments[0]:
        segments = segments[1:]

    if len(segments) < 2:
        raise ValueError(
            f"Cannot extract owner and repository name from URL: {url!r}"
        )

    owner, name = segments[0], segments[1]

    # Clone URLs end in ".git", which is not part of the repository name.
    if name.endswith(".git") and len(name) > len(".git"):
        name = name[: -len(".git")]

    return owner, name
def is_important_file(path):
    """Return True when a repository file path should be included in the analysis.

    All marker checks are plain substring tests against the whole path, not
    per-directory comparisons. That means ``"service"`` also matches
    ``services/``, and ``"test"`` also rejects names such as ``latest.py``.

    A path is accepted when all of the following hold:
      * it contains none of the skip markers,
      * it ends with one of the recognised source-code suffixes,
      * it either contains one of the source-directory markers or sits at
        most one directory below the root (at most one ``/`` in the path).

    Args:
        path: Repository-relative file path using ``/`` separators.

    Returns:
        bool: True if the file should be kept, False otherwise.
    """
    skip_markers = ("node_modules", "test", "__tests__", ".git", "build", ".next", ".vscode")
    source_markers = ("src", "components", "pages", "hooks", "controller", "service")
    code_suffixes = (".py", ".js", ".ts", ".jsx", ".tsx", ".java")

    # Anything touching a skipped location is rejected before other checks.
    for marker in skip_markers:
        if marker in path:
            return False

    # Both accepting rules require a recognised source-code suffix.
    if not path.endswith(code_suffixes):
        return False

    in_source_dir = any(marker in path for marker in source_markers)
    is_shallow = path.count("/") <= 1
    return in_source_dir or is_shallow
def get_filtered_file_contents(repo, path=""):
    """Recursively collect the text of the important files found under ``path``.

    Directories are descended into depth-first, in the order the API returns
    them. Files are kept only when ``is_important_file`` accepts their path.

    Args:
        repo: Repository object whose ``get_contents(path)`` returns entries
            exposing ``type``, ``path`` and ``decoded_content`` (the PyGithub
            repository created in ``analyze_repo``).
        path: Repository-relative path to start from; ``""`` is the root.
            May name a directory or a single file.

    Returns:
        dict mapping each kept file path to its UTF-8 decoded content. A file
        that cannot be read or decoded maps to ``"Unable to read: <error>"``.
    """
    contents = repo.get_contents(path)

    # get_contents returns one entry instead of a list when ``path`` names a
    # file; wrap it so the loop below handles both cases.
    if hasattr(contents, "path"):
        contents = [contents]

    all_files = {}
    for content in contents:
        if content.type == "dir":
            all_files.update(get_filtered_file_contents(repo, content.path))
        elif is_important_file(content.path):
            try:
                all_files[content.path] = content.decoded_content.decode("utf-8")
            except Exception as e:
                # Deliberate best-effort: one unreadable file (binary data,
                # oversized blob, API error) must not abort the whole walk.
                all_files[content.path] = f"Unable to read: {e}"
    return all_files
def analyze_repo(repo_url):
    """Fetch metadata and important source files for a GitHub repository.

    Authenticates with the ``GITHUB_TOKEN`` environment variable when it is
    set. When the variable is missing or blank, an unauthenticated client is
    used instead of failing on ``None.strip()``.

    Args:
        repo_url: URL of the repository, in a form accepted by
            ``extract_repo_name``.

    Returns:
        dict with the keys ``"name"``, ``"description"``, ``"topics"``,
        ``"files"`` (path -> content, from ``get_filtered_file_contents``)
        and ``"url"``.
    """
    # The token is stripped because a trailing newline from an .env file or
    # secret store would otherwise corrupt the credential.
    token = (os.getenv("GITHUB_TOKEN") or "").strip()

    # A blank token would be sent as an invalid credential; anonymous access
    # is the better fallback for public repositories.
    g = Github(token) if token else Github()

    owner, name = extract_repo_name(repo_url)
    repo = g.get_repo(f"{owner}/{name}")
    info = {
        "name": repo.name,
        "description": repo.description,
        "topics": repo.get_topics(),
        "files": get_filtered_file_contents(repo),
        "url": repo.html_url,
    }
    return info