# Install AutoGluon
!pip install autogluon
import pandas as pd
from autogluon.tabular import TabularPredictor
# Load training data
file_path = '/content/drive/MyDrive/Colab Notebooks/train.csv'
df = pd.read_csv(file_path)

# View data basic information.
# DataFrame.info() prints its summary and returns None, so the result is not
# bound to a name.
df.info()
df_head = df.head()
df_shape = df.shape
print(df_shape)
print(df_head)

# Target variables (classification)
label = 'Category'
label2 = 'Misconception'

# Replace missing Misconception values with the literal string "NA" so they
# form an explicit class instead of being treated as missing labels.
df[label2] = df[label2].fillna("NA")

# Combine text features: question, chosen answer and student explanation are
# joined with single spaces; missing pieces contribute an empty string.
df['combined_text'] = (
    df['QuestionText'].fillna('')
    + ' '
    + df['MC_Answer'].fillna('')
    + ' '
    + df['StudentExplanation'].fillna('')
)

# Create sub-dataset for training Category (rows with any missing value in
# these three columns are dropped)
df_train = df[['combined_text', 'QuestionId', label]].dropna()

# AutoGluon training for Category
predictor = TabularPredictor(label=label).fit(
    train_data=df_train,
    num_gpus=1,
    time_limit=1800,  # Can be set larger, e.g., 3600 seconds
    presets='best_quality',  # Or 'medium_quality_faster_train'
)

# Evaluate Category predictor
# NOTE(review): this scores the model on the same rows it was fitted on, so
# the reported metrics are likely optimistic; consider a held-out split.
predictor.evaluate(df_train)

# Load test data
test_path = '/content/drive/MyDrive/Colab Notebooks/test.csv'
test_data = pd.read_csv(test_path)

# Build the same combined text feature that was used for training
test_data['combined_text'] = (
    test_data['QuestionText'].fillna('')
    + ' '
    + test_data['MC_Answer'].fillna('')
    + ' '
    + test_data['StudentExplanation'].fillna('')
)

# Predict probabilities for Category on test data
predictors = predictor.predict_proba(test_data)

# Create sub-dataset for training Misconception
df_train_Misconception = df[['combined_text', label2]]

# AutoGluon training for Misconception
predictor_Misconception = TabularPredictor(label=label2).fit(
    train_data=df_train_Misconception,
    num_gpus=1,
    time_limit=1800,  # Can be set larger, e.g., 3600 seconds
    presets='best_quality',  # Or 'medium_quality_faster_train'
)

# Evaluate Misconception predictor
# NOTE(review): evaluated on the training rows themselves, so the metrics are
# likely optimistic; consider a held-out split.
predictor_Misconception.evaluate(df_train_Misconception)

# Predict probabilities for Misconception on test data
predictor_Misconceptions = predictor_Misconception.predict_proba(test_data)

# Get top 3 categories and misconceptions for the first 5 rows (example)
# For each of the first five rows, list the three class labels (column names
# of the probability frame) with the highest predicted probability.
top_3_catorgray = predictors.head().apply(
    lambda row: row.nlargest(3).index.tolist(), axis=1
)
top_3_misconceptions = predictor_Misconceptions.head().apply(
    lambda row: row.nlargest(3).index.tolist(), axis=1
)

print("Top 3 Categories for first 5 rows:")
display(top_3_catorgray)

print("\nTop 3 Misconceptions for first 5 rows:")
display(top_3_misconceptions)

# Function to get top combinations of category and misconception
import itertools
def get_top_combinations(category_probs, misconception_probs, n=3):
    """Return the top ``n`` category/misconception pairs by joint score.

    The ``n`` most probable categories are paired with the ``n`` most probable
    misconceptions, each pair is scored by the product of its two
    probabilities, and the ``n`` best-scoring pairs are returned.

    Args:
        category_probs (pd.Series): Probabilities indexed by category name.
        misconception_probs (pd.Series): Probabilities indexed by
            misconception name.
        n (int): The number of top combinations to return.

    Returns:
        list: Up to ``n`` strings in the format 'category:misconception',
        ordered from highest to lowest score. Fewer than ``n`` entries are
        returned when fewer pairs exist.
    """
    # Local import keeps the function usable regardless of module-level imports.
    from itertools import product

    # Compute each top-n selection once instead of once per outer iteration.
    top_categories = category_probs.nlargest(n)
    top_misconceptions = misconception_probs.nlargest(n)

    scored_pairs = [
        (f"{cat_name}:{mis_name}", cat_prob * mis_prob)
        for (cat_name, cat_prob), (mis_name, mis_prob) in product(
            top_categories.items(), top_misconceptions.items()
        )
    ]

    # Stable sort: pairs with equal scores keep category-major order.
    scored_pairs.sort(key=lambda item: item[1], reverse=True)

    return [pair for pair, _ in scored_pairs[:n]]
# Apply the function to each row of the prediction DataFrames.
# row.name is the row's index label, so the matching misconception row is
# looked up by label (.loc) rather than by position (.iloc).
top_combinations = predictors.apply(
    lambda row: get_top_combinations(row, predictor_Misconceptions.loc[row.name]),
    axis=1,
)

print("\nTop 3 Category:Misconception combinations:")
display(top_combinations)

# Create a new DataFrame to store the results: one row per test row, holding
# its row_id and its list of top combinations.
results_df = pd.DataFrame()
results_df['row_id'] = test_data['row_id']
results_df['Category:Misconception'] = top_combinations

# Convert the list of combinations to a single string
# NOTE(review): confirm that ', ' is the separator the submission format expects.
results_df['Category:Misconception'] = results_df['Category:Misconception'].apply(', '.join)

# Display the resulting DataFrame
print("\nResults DataFrame:")
display(results_df)

# Save the results to a CSV file
results_df.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created.")