#TF-IDF

15 messages · Page 1 of 1 (latest)

random vapor
#

my tf-Idf vectorozer not working.

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv('UpdatedResumeDataSet.csv')


def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.lower()


data['cleaned_resume'] = data['Resume'].apply(clean_text)
#[email protected]
#nltk.data.find('stopword')
#nltk.download('stopwords')
#nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return words

data['tokenized_text'] = data['cleaned_resume'].apply(tokenize_and_remove_stopwords)

print(data['tokenized_text'])

tfidf_vectorizer = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['tokenized_text'])
for i in range(len(data['tokenized_text'])):
    tfidf_matrix[i] = tfidf_vectorizer.fit_transform(data ['tokenized_text'] [1])

print(tfidf_matrix)
print(tfidf_vectorizer.vocabulary_)
sudden heron
#

Didn't we have this exact question before

random vapor
#

yes I thought I solved it but I'm bCK TO SQUre one

sudden heron
#

Yes you still have the same issue, you're trying to call lower() on a list but you can only do that on strings

random vapor
#

checked it by doing print(type(str))

#

I'm sure it's string

sudden heron
#

What's does print(type(data['tokenized_text'])) show you?

random vapor
#

this would be series

#

let me check

random vapor
#

it's pandas.series

#

<class 'pandas.core.series.Series'>

sudden heron
#

from the docs:

If 'content', the input is expected to be a sequence of items that can be of type string or byte.