My TF-IDF vectorizer is not working.
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the resume dataset from CSV; the 'Resume' column is read further down.
data = pd.read_csv('UpdatedResumeDataSet.csv')
def clean_text(text):
    """Normalize raw resume text for tokenization.

    Every run of non-word characters (punctuation, symbols, whitespace) is
    collapsed into a single space and the result is lower-cased.

    Args:
        text: Raw text. Missing values (``None``/NaN, as pandas produces for
            empty CSV cells) yield an empty string; any other non-string
            value is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings such as NaN.
        if pd.isna(text):
            return ''
        text = str(text)
    # Whitespace is itself non-word, so one pass over runs of non-word
    # characters does the work of "replace each one, then squeeze spaces".
    return re.sub(r'\W+', ' ', text).lower()
# Add a normalized copy of every resume next to the raw 'Resume' column.
data['cleaned_resume'] = data['Resume'].apply(clean_text)
# NOTE(review): the next comment looks garbled by e-mail obfuscation; its
# original text is unknown.
#[email protected]
# Disabled NLTK setup calls, kept for reference. stopwords.words() and
# nltk.word_tokenize() below presumably need the 'stopwords' and 'punkt'
# resources these would fetch -- TODO confirm for the installed NLTK version.
#nltk.data.find('stopword')
#nltk.download('stopwords')
#nltk.download('punkt')
# Build the stop-word collection as a set; it is used for membership tests
# when filtering tokens.
stop_words = set(stopwords.words('english'))
def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` with NLTK and drop every token found in ``stop_words``.

    Returns the surviving tokens as a list, in their original order.
    """
    return [
        token
        for token in nltk.word_tokenize(text)
        if token not in stop_words
    ]
data['tokenized_text'] = data['cleaned_resume'].apply(tokenize_and_remove_stopwords)
print(data['tokenized_text'])

# TfidfVectorizer consumes one string per document, and its default
# preprocessing calls str.lower() on each item, so feeding it token lists
# fails. Re-join each token list into a space-separated string first.
documents = data['tokenized_text'].apply(' '.join)

# Fit once on the whole corpus: this learns a single shared vocabulary
# (capped at the 3000 most frequent terms) and returns a sparse matrix with
# one row per resume. Re-fitting per document would discard that vocabulary,
# so no per-row loop is needed.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(tfidf_matrix)
print(tfidf_vectorizer.vocabulary_)