import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Ensure stopwords and stemmer are available
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Translation table that deletes every character in string.punctuation in a
# single pass (equivalent to filtering the text character by character).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Row boundaries of the positional train / validation / test split.
TRAIN_END = 204045
VALIDATION_END = 215377

# Text columns that receive the preprocessing below.
TEXT_COLUMNS = ('document', 'summary')

# Assuming ds is your DataFrame
# Sample DataFrame for demonstration
# ds = pd.read_csv('your_dataset.csv')

# Split the dataset into train, validation, and test sets.
# .copy() makes each split an independent frame, so the column assignments
# below do not write into (or warn about) a slice of ds.
train_df = ds.iloc[:TRAIN_END].copy()
validation_df = ds.iloc[TRAIN_END:VALIDATION_END].copy()
test_df = ds.iloc[VALIDATION_END:].copy()


# Text Preprocessing Function
def preprocess_text(text):
    """Normalize one text cell for TF-IDF vectorization.

    The text is lower-cased, stripped of every character in
    ``string.punctuation``, split on whitespace, filtered against
    ``stop_words`` and stemmed with ``stemmer``.

    Args:
        text: The raw cell value. Non-string values (for example NaN or
            None from missing data) are treated as empty text.

    Returns:
        The remaining stemmed words joined by single spaces; ``''`` when
        nothing remains or the input is not a string.
    """
    if not isinstance(text, str):
        # Missing values arrive as float NaN / None and have no .lower().
        return ''
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(
        stemmer.stem(word) for word in cleaned.split() if word not in stop_words
    )


def _downsample(frame, n_samples, random_state=42):
    """Return a reproducible random subset of at most n_samples rows of frame."""
    # replace=False keeps every selected row unique; the default (True) draws
    # a bootstrap sample that contains duplicates. Capping at len(frame)
    # avoids the ValueError resample raises when asked for more rows than
    # exist without replacement.
    return resample(
        frame,
        replace=False,
        n_samples=min(n_samples, len(frame)),
        random_state=random_state,
    )


# Apply text preprocessing to the document and summary columns
for _frame in (train_df, validation_df, test_df):
    for _column in TEXT_COLUMNS:
        _frame[_column] = _frame[_column].apply(preprocess_text)

# Vectorization using TF-IDF; max_features=1000 caps the vocabulary size.
# The vocabulary is fitted on the training documents only and reused as-is
# for the validation and test documents.
# NOTE: these matrices are built from the full splits, so their rows line up
# with train_df / validation_df / test_df, not with the reduced frames below.
vectorizer = TfidfVectorizer(max_features=1000)
train_vectors = vectorizer.fit_transform(train_df['document'])
validation_vectors = vectorizer.transform(validation_df['document'])
test_vectors = vectorizer.transform(test_df['document'])

# Sample Reduction
reduced_train_df = _downsample(train_df, 50000)
reduced_validation_df = _downsample(validation_df, 3000)
reduced_test_df = _downsample(test_df, 3000)

# Display reduced datasets
print(f"Reduced Train Set Size: {reduced_train_df.shape}")
print(f"Reduced Validation Set Size: {reduced_validation_df.shape}")
print(f"Reduced Test Set Size: {reduced_test_df.shape}")