In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
url = "https://lightning.ai/docs/pytorch/latest/starter/introduction.html"

response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    div_content = soup.find('div', class_='rst-content')

    if div_content:
        sections = div_content.find_all('section')

        for section in sections[1:]:

            section_content = section.get_text()
            print(section_content)
            print('-------------------')
    else:
        print("Div element with class 'rst-content' not found. Check the HTML structure of the page.")

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


1: Install PyTorch Lightning¶

For pip users
pip install lightning



For conda users
conda install lightning -c conda-forge



Or read the advanced install guide


-------------------

2: Define a LightningModule¶
A LightningModule enables your PyTorch nn.Module to play together in complex ways inside the training_step (there is also an optional validation_step and test_step).
import os
from torch import optim, nn, utils, Tensor
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
import lightning as L

# define any number of nn.Modules (or use your current ones)
encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))


# define the LightningModule
class LitAutoEncoder(L.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def training_step(self, batch, batch

In [None]:
from urllib.parse import urlparse, urlunparse,urljoin

In [None]:
def remove_fragment(url):
    parser_url = urlparse(url)

    new_url = urlunparse(parser_url._replace(fragment=""))
    return new_url

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

crawled_urls = set()


# Function to fetch and extract links from a page
def get_links(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            links = []
            for a_tag in soup.find_all("a", href=True):
                link = a_tag["href"]
                links.append(link)
            return links
    except Exception as e:
        print(f"Failed to fetch links from {url}: {e}")
    return []


# Function to recursively fetch links within the same domain
def fetch_links_recursive(base_url, current_url, visited_urls, max_depth=4):
    if current_url in visited_urls or max_depth == 0:
        return

    visited_urls.add(current_url)
    links = get_links(current_url)
    for link in links:
        absolute_url = urljoin(current_url, link)
        parsed_url = urlparse(absolute_url)
        if parsed_url.netloc == base_url.netloc:
            crawled_urls.add(remove_fragment(absolute_url))
            fetch_links_recursive(base_url, absolute_url, visited_urls, max_depth - 1)

In [None]:
base_url = urlparse(
    "https://lightning.ai/docs/pytorch/latest/starter/introduction.html"
)
visited_urls = set()
fetch_links_recursive(base_url, base_url.geturl(), visited_urls)

In [None]:
len(crawled_urls)

275

In [None]:
crawled_urls

{'https://lightning.ai/docs/fabric/',
 'https://lightning.ai/docs/pytorch/latest/_images/custom_loop.png',
 'https://lightning.ai/docs/pytorch/latest/_images/ddp.gif',
 'https://lightning.ai/docs/pytorch/latest/_modules/lightning/fabric/utilities/throughput.html',
 'https://lightning.ai/docs/pytorch/latest/_modules/lightning/pytorch/core/module.html',
 'https://lightning.ai/docs/pytorch/latest/_modules/lightning/pytorch/trainer/trainer.html',
 'https://lightning.ai/docs/pytorch/latest/_sources/accelerators/gpu.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/accelerators/tpu.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/advanced/speed.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/api_references.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/common/checkpointing.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/common/index.rst.txt',
 'https://lightning.ai/docs/pytorch/latest/_sources/common/lightning_module.rst.txt

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


def extract_sections_to_csv(url, output_file):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the div element with class "rst-content"
        div_content = soup.find("div", class_="rst-content")

        if div_content:
            # Find all section tags within the div_content
            sections = div_content.find_all("section")

            # Create a list to store the sections
            section_contents = []

            for section in sections[1:]:
                # Extract the content of each section
                section_content = section.get_text()
                section_contents.append(section_content)

            # Create a DataFrame with URL and Section Content columns
            df = pd.DataFrame(
                {
                    "URL": [url] * len(section_contents),
                    "Section Content": section_contents,
                }
            )

            # Save the DataFrame to a CSV file
            df.to_csv(output_file, index=False)

        else:
            print(
                "Div element with class 'rst-content' not found. Check the HTML structure of the page."
            )

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")


# Example usage
url = "https://lightning.ai/docs/pytorch/latest"
output_file = "sections.csv"
extract_sections_to_csv(url, output_file)

In [None]:
!mkdir crawled

In [None]:
from rich.progress import track

for i, url in enumerate(track(crawled_urls)):
    output_file = f"crawled/{i}.csv"
    extract_sections_to_csv(url, output_file)

Output()

In [None]:
!pip install -q langchain chromadb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m593.7/593.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.6/72.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.8/143.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.9 MB/s[0m eta [

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
from glob import glob
from chromadb.utils import embedding_functions

In [None]:
import chromadb
chroma_client = chromadb.PersistentClient(path="db")

collection = chroma_client.create_collection(name="test")
collection = chroma_client.get_collection(name="test")

In [None]:
csvs = glob("crawled/*.csv")

In [None]:
from rich.progress import track
from rich import print
from os.path import basename

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_trans

In [None]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="BAAI/llm-embedder")
data=[]
for csv in track(csvs):
  df=pd.read_csv(csv)
  if len(df)==0:
    continue
  urls, documents = df["URL"].tolist(), df["Section Content"].tolist()
  embeddings = sentence_transformer_ef(documents)
  assert len(urls)==len(documents) == len(embeddings)
  base = basename(urls[0])
  collection.add(
      embeddings=embeddings,
      documents=documents,
      metadatas = [{"source":url} for url in urls],
      ids = [f"{base}_{i}" for i in range(len(documents))]
  )

Output()

In [32]:
query = """How would I add an input argument to the lightning module core function and use them from the trainer?
something like this:
def predict_step(self, batch, batch_idx, n_tokens=32)"""

query = "NeurIPS 2023 LLM Efficiency Challenge Quickstart Guide"

query_texts=[query]
query_embeddings = sentence_transformer_ef(query_texts)
result = collection.query(query_embeddings=query_embeddings, n_results=2)

In [33]:
result

{'ids': [['callbacks.html_45', 'lightning_module.html_97']],
 'distances': [[0.31527483463287354, 0.321233868598938]],
 'metadatas': [[{'source': 'https://lightning.ai/docs/pytorch/latest/extensions/callbacks.html'},
   {'source': 'https://lightning.ai/docs/pytorch/latest/common/lightning_module.html'}]],
 'embeddings': None,
 'documents': [['\non_before_optimizer_step¶\n\n\nCallback.on_before_optimizer_step(trainer, pl_module, optimizer)[source]\nCalled before optimizer.step().\n\nReturn type:\nNone\n\n\n\n',
   '\noptimizer_step¶\n\n\nLightningModule.optimizer_step(epoch, batch_idx, optimizer, optimizer_closure=None)[source]\nOverride this method to adjust the default way the Trainer calls\nthe optimizer.\nBy default, Lightning calls step() and zero_grad() as shown in the example.\nThis method (and zero_grad()) won’t be called during the accumulation phase when\nTrainer(accumulate_grad_batches != 1). Overriding this hook has no benefit with manual optimization.\n\nParameters:\n\nepoc

In [34]:
def read_md(file):
    with open(file, "r") as fr:
        data =  fr.read()
    return data

files = glob("crawled/tutorials/*.md")
data = []
for file in track(files):
    document = read_md(file)
    embedding = sentence_transformer_ef([document])
    base = basename(file)
    a = collection.add(
            embeddings=embedding,
            documents=[document],
            metadatas=[{"source": base}],
            ids=[base]
        )

Output()

In [None]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.19.tar.gz (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.19-cp310-cp310-manylinux_2_35_x86_64.whl size=1978774 sha256=47d53a8c7f3c84e05ae637ca3a9369e04fd475bd1154baac102653d280445430
  Stored in directory: /root/.cache/pip/wheels/c7/39/87/39c101006774e09d62a2210a52cee6e93e390ee8eda5e36a6f
Successfully built llama-cpp-python
Installing collected packages: llama-cpp-python
Successfully installed llama-cpp-python-0.2.19


In [None]:
from llama_cpp import Llama

In [None]:
!wget https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf

--2023-11-27 10:58:42--  https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 65.8.178.118, 65.8.178.12, 65.8.178.93, ...
Connecting to huggingface.co (huggingface.co)|65.8.178.118|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/a2/c6/a2c63827017d81931777a84eb0e153b8b34902e46289c684623d88c2e6243782/ce6253d2e91adea0c35924b38411b0434fa18fcb90c52980ce68187dbcbbe40c?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mistral-7b-v0.1.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-v0.1.Q4_K_M.gguf%22%3B&Expires=1701341922&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMTM0MTkyMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9hMi9jNi9hMmM2MzgyNzAxN2Q4MTkzMTc3N2E4NGViMGUxNTNiOGIzNDkwMmU0NjI4OWM2ODQ2MjNkODhjMmU2MjQzNzgyL2NlNjI1M2QyZTkxYWRlYTBjMzU5MjRiMzg0MTFiMDQzNGZhMThmY2

In [30]:
from contextlib import redirect_stdout, redirect_stderr
from os import devnull
from llama_cpp import Llama
from contextlib import suppress

# Redirect stdout and stderr to /dev/null
with redirect_stdout(open(devnull, 'w')):
    with redirect_stderr(open(devnull, 'w')):
        llm = Llama(model_path="mistral-7b-v0.1.Q4_K_M.gguf", main_gpu=1, n_ctx=1028, verbose=False)


In [35]:
prompt = f"""Answer the given question based on the context. If you don't know the answer then respond with I don't know.
Context: {result['documents'][0][0][1024:]}
----
Q: {query}
A:"""

In [36]:
print(prompt)

In [47]:
output = llm(prompt, echo=False, stop=["\n\n"], temperature=0.1,)

In [46]:
llm(
    "Format the question and do not deviate too much - How do I read the results of the Learning Rate finder when using it like a callback? ",
    echo=False, stop=["\n\n"],
    temperature=0.1,
)

{'id': 'cmpl-8f073b1c-1bee-4343-a6a8-436e6b239feb',
 'object': 'text_completion',
 'created': 1701083777,
 'model': 'mistral-7b-v0.1.Q4_K_M.gguf',
 'choices': [{'text': '1. What is the difference between the learning rate finder and the learning rate scheduler? 2. When should I use one over the other? 3. How do I read the results of the learning rate finder when using it like a callback?',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 32, 'completion_tokens': 55, 'total_tokens': 87}}

In [48]:
print(output)

In [49]:
print(output["choices"][0]["text"])