import streamlit as st
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download NLTK data
nltk.download("punkt")
nltk.download("punkt_tab")  # required by the tokenizers on newer NLTK releases
nltk.download("stopwords")
nltk.download("wordnet")
# Streamlit app configuration
st.set_page_config(page_title="NLP Basics for Beginners", page_icon="🔤", layout="wide")

st.title("🔤 NLP Basics for Beginners")
st.markdown(
    """
    Welcome to the **NLP Basics App**!
    Here, you'll learn about the foundational concepts of **Natural Language Processing (NLP)** through interactive examples.

    Let's explore:
    - **What is NLP?** Its applications and use cases.
    - **Text Representation Basics**: Tokens, sentences, words, stopwords, lemmatization, stemming.
    - **Vectorization Techniques**: Bag of Words (BoW) and TF-IDF.
    """
)
# Divider
st.markdown("---")

# Sidebar navigation
st.sidebar.title("Navigation")
sections = [
    "Introduction to NLP",
    "Tokenization",
    "Stopwords",
    "Lemmatization & Stemming",
    "Bag of Words (BoW)",
    "TF-IDF",
]
selected_section = st.sidebar.radio("Choose a section", sections)

# Input text box
st.sidebar.write("### Enter Text to Analyze:")
text_input = st.sidebar.text_area("Input your text here:", height=150, placeholder="Type or paste some text here...")
if not text_input.strip():
    st.sidebar.warning("Please enter some text to explore NLP concepts.")
# Section 1: Introduction to NLP
if selected_section == "Introduction to NLP":
    st.header("💡 What is NLP?")
    st.write(
        """
        Natural Language Processing (NLP) is a field of Artificial Intelligence (AI) focused on the interaction between computers and human language.
        It enables machines to understand, interpret, and generate human language.

        ### **Applications of NLP**:
        - **Chatbots**: AI-powered conversational agents (e.g., Siri, Alexa).
        - **Text Summarization**: Extracting important information from lengthy documents.
        - **Machine Translation**: Translating text between languages (e.g., Google Translate).
        - **Sentiment Analysis**: Understanding opinions in social media or reviews (positive/negative/neutral).
        """
    )
    st.image("https://miro.medium.com/max/1400/1*H0qcbsUCWkE7O__q2XkKYA.png", caption="Applications of NLP", use_column_width=True)
# Section 2: Tokenization
if selected_section == "Tokenization":
    st.header("🔤 Tokenization")
    st.write(
        """
        **Tokenization** is the process of breaking down text into smaller units, like sentences or words.
        It is a critical first step in many NLP tasks.

        ### Types of Tokenization:
        1. **Sentence Tokenization**: Splitting text into sentences.
        2. **Word Tokenization**: Splitting text into individual words (tokens).

        **Example Input**: "I love NLP. It's amazing!"

        **Sentence Tokens**: ["I love NLP.", "It's amazing!"]

        **Word Tokens**: ["I", "love", "NLP", ".", "It", "'s", "amazing", "!"]
        """
    )
    if text_input.strip():
        st.subheader("Try Tokenization on Your Input Text")
        st.write("**Sentence Tokenization**:")
        sentences = sent_tokenize(text_input)
        st.write(sentences)
        st.write("**Word Tokenization**:")
        words = word_tokenize(text_input)
        st.write(words)
# Section 3: Stopwords
if selected_section == "Stopwords":
    st.header("🛑 Stopwords")
    st.write(
        """
        **Stopwords** are common words (e.g., "and", "is", "the") that add little meaning to text and are often removed in NLP tasks.
        Removing stopwords helps focus on the essential words in a text.

        For example:

        **Input**: "This is an example of stopwords removal."

        **Output**: ["example", "stopwords", "removal"]
        """
    )
    if text_input.strip():
        st.subheader("Remove Stopwords from Your Input Text")
        stop_words = set(stopwords.words("english"))
        words = word_tokenize(text_input)
        # Keep alphabetic tokens that are not stopwords (punctuation is dropped
        # as well, matching the example output above)
        filtered_words = [word for word in words if word.isalpha() and word.lower() not in stop_words]
        st.write("**Original Words**:", words)
        st.write("**Words after Stopwords Removal**:", filtered_words)
# Section 4: Lemmatization & Stemming
if selected_section == "Lemmatization & Stemming":
    st.header("🌱 Lemmatization and Stemming")
    st.write(
        """
        ### **Stemming**:
        Reduces words to their root form by chopping off prefixes/suffixes.

        **Example**: "running" → "run", "studies" → "studi"

        ### **Lemmatization**:
        Returns the base (dictionary) form of a word using vocabulary and context.

        **Example**: "running" → "run", "better" → "good" (when the word's part of speech is known)
        """
    )
    if text_input.strip():
        st.subheader("Apply Stemming and Lemmatization")
        words = word_tokenize(text_input)
        ps = PorterStemmer()
        stemmed_words = [ps.stem(word) for word in words]
        st.write("**Stemmed Words**:", stemmed_words)
        lemmatizer = WordNetLemmatizer()
        # Without a part-of-speech tag, WordNetLemmatizer treats every word as a
        # noun, so verb forms like "running" are left unchanged here
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        st.write("**Lemmatized Words**:", lemmatized_words)
# Section 5: Bag of Words (BoW)
if selected_section == "Bag of Words (BoW)":
    st.header("📦 Bag of Words (BoW)")
    st.write(
        """
        **Bag of Words (BoW)** is a text representation technique that converts text into a vector of word frequencies.
        It ignores word order but considers the occurrence of words.

        ### Example:
        **Input Texts**:
        1. "I love NLP."
        2. "NLP is amazing!"

        **BoW Matrix**:

        |       | I | love | NLP | is | amazing |
        |-------|---|------|-----|----|---------|
        | Text1 | 1 | 1    | 1   | 0  | 0       |
        | Text2 | 0 | 0    | 1   | 1  | 1       |
        """
    )
    if text_input.strip():
        st.subheader("Generate BoW for Your Input Text")
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform([text_input])
        st.write("**BoW Matrix**:")
        st.write(X.toarray())
        st.write("**Feature Names (Words):**")
        st.write(vectorizer.get_feature_names_out())
# Section 6: TF-IDF
if selected_section == "TF-IDF":
    st.header("📊 TF-IDF (Term Frequency-Inverse Document Frequency)")
    st.write(
        """
        **TF-IDF** is a statistical measure that evaluates how important a word is to a document in a collection of documents.
        It balances the frequency of a word with its rarity across documents.

        ### Formula:
        - **Term Frequency (TF)**: How often a word appears in a document.
        - **Inverse Document Frequency (IDF)**: Log of the total number of documents divided by the number of documents containing the word.

        **Example**:
        - "NLP is amazing."
        - "I love NLP."

        TF-IDF assigns higher weights to rare but significant words.
        """
    )
    if text_input.strip():
        st.subheader("Generate TF-IDF for Your Input Text")
        tfidf_vectorizer = TfidfVectorizer()
        # Treat each sentence as a separate document: with only a single
        # document, every term gets the same IDF and the scores are uninformative
        documents = sent_tokenize(text_input)
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
        st.write("**TF-IDF Matrix**:")
        st.write(tfidf_matrix.toarray())
        st.write("**Feature Names (Words):**")
        st.write(tfidf_vectorizer.get_feature_names_out())
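# A standalone sketch of TfidfVectorizer on the section's two-document example:
# "nlp" appears in both documents, so its IDF (and hence its weight) is lower
# than that of words unique to one document, such as "amazing":

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["NLP is amazing.", "I love NLP."]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs).toarray()
vocab = vectorizer.vocabulary_  # maps each word to its column index

# In the first document, the shared word "nlp" is weighted below the
# document-specific word "amazing"
print(tfidf[0][vocab["nlp"]], tfidf[0][vocab["amazing"]])
```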
# Footer
st.markdown("---")
st.markdown(
    """
    <center>
    <p style='font-size:14px;'>© 2024 NLP Basics App. All Rights Reserved.</p>
    </center>
    """,
    unsafe_allow_html=True,
)