# Setup
Install basic dependenciesx

In [975]:
import pandas as pd
import numpy as np
from IPython.display import Markdown, display


# Display settings for pandas
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.4f}'.format)


How many students per team?

In [976]:
STUDENTS_PER_TEAM = 5

A blank dataframe that will hold each student's assigned project

In [977]:
df_assignments = pd.DataFrame(columns=['email', 'project', 'own'])

## Open data file

In [978]:
INPUT_FILE = 'data/project_preferences_fall2025.csv'
OUTPUT_FILE = 'data/project_assignments-v2.csv'

In [979]:
df_votes = pd.read_csv(INPUT_FILE)


Simplify column names and remove unnecessary columns.

In [980]:
df_votes = df_votes.rename(columns={
    'Email Address': 'email',
    'First choice': 'first',
    'Second choice': 'second',
    'Third choice': 'third',
    'Are any of these choices your own project proposal?': 'voted_for_self'
})[['email', 'first', 'second', 'third', 'voted_for_self']]

Remove all before a dash and trim whitespace.

In [981]:
for col in ['first', 'second', 'third']:
    df_votes[col] = df_votes[col].str.split('-').str[0].str.strip()

Convert all project names to lowercase.

In [982]:
for col in ['first', 'second', 'third']:
    df_votes[col] = df_votes[col].str.lower()

In [983]:
df_votes.sample(5)

Unnamed: 0,email,first,second,third,voted_for_self
48,ae2508@nyu.edu,fithub,cartranker,gymflow nyu,No - I did not select my own project proposal ...
24,sd5113@nyu.edu,cash me if you can,dungeons and distractions,toberead,No - I did not select my own project proposal ...
100,sp7007@nyu.edu,taskbank,nutrismart,rirf,Yes - First choice
59,las9963@nyu.edu,einstein,roomiehub,vodtracker,Yes - First choice
103,ab11578@nyu.edu,taskbank,nutrismart,rirf,Yes - First choice


## Determine project names

In [984]:
project_names = pd.Series(pd.unique(df_votes[['first', 'second', 'third']].values.ravel()))

In [985]:
project_names = project_names.str.lower()

In [986]:
project_names.describe()

count             52
unique            52
top       rendezvous
freq               1
dtype: object

In [987]:
project_names.sample(3)

44    flavors & fortunes
27          project crux
21              dinewise
dtype: object

## Count votes

Total number of first, second, and third choice votes for each project.

In [988]:
def get_remaining_vote_counts(df_votes):
    vote_counts = pd.DataFrame({
        'project': project_names,
        'first_count': [df_votes['first'].str.lower().eq(name).sum() for name in project_names],
        'second_count': [df_votes['second'].str.lower().eq(name).sum() for name in project_names],
        'third_count': [df_votes['third'].str.lower().eq(name).sum() for name in project_names]
    })
    vote_counts['total'] = vote_counts['first_count'] + vote_counts['second_count'] + vote_counts['third_count']

    vote_counts.sort_values(
        by=['first_count', 'second_count', 'third_count'],
        ascending=[False, False, False]
    )
    return vote_counts

In [989]:
vote_counts = get_remaining_vote_counts(df_votes)
vote_counts

Unnamed: 0,project,first_count,second_count,third_count,total
0,rendezvous,4,1,7,12
1,tbd,0,2,2,4
2,next quad,4,4,0,8
3,kitchen helper,1,0,1,2
4,commyounity,3,2,5,10
5,study space finder @ nyu,3,6,4,13
6,careconnect,2,1,1,4
7,cartranker,1,3,3,7
8,cardwise automations,2,5,2,9
9,pocketbudget,3,3,2,8


Identify projects with insufficient votes

In [990]:
# Identify projects with insufficient total votes
vote_counts = get_remaining_vote_counts(df_votes)
insufficient_projects = vote_counts[vote_counts['total'] < STUDENTS_PER_TEAM][['project', 'total']]
insufficient_projects


Unnamed: 0,project,total
1,tbd,4
3,kitchen helper,2
6,careconnect,4
19,fishdex,3
30,campusconnect,3
31,spin!,4
33,engage 2.0,3
37,sanpvite,2
43,over budget,3
44,flavors & fortunes,1


Remove those projects with insufficient votes from the vote counts.

In [991]:

# Remove these projects from vote_counts
vote_counts = vote_counts[~vote_counts['project'].isin(insufficient_projects['project'])].reset_index(drop=True)
vote_counts

Unnamed: 0,project,first_count,second_count,third_count,total
0,rendezvous,4,1,7,12
1,next quad,4,4,0,8
2,commyounity,3,2,5,10
3,study space finder @ nyu,3,6,4,13
4,cartranker,1,3,3,7
5,cardwise automations,2,5,2,9
6,pocketbudget,3,3,2,8
7,tasklatte,2,2,7,11
8,roulette go,3,2,1,6
9,moodsphere,5,2,1,8


Remove mention of those projects from df_votes.

In [992]:
for project in insufficient_projects:
    for col in ['first', 'second', 'third']:
        df_votes.loc[df_votes[col] == project, col] = np.nan

df_votes.sample(10)


Unnamed: 0,email,first,second,third,voted_for_self
105,dhh5829@nyu.edu,rendezvous,upstore,gesturetalk,Yes - First choice
22,cp3588@nyu.edu,gesturetalk,roomiehub,engage 2.0,Yes - First choice
91,sk10741@nyu.edu,camp,next quad,the language café,Yes - First choice
25,amp10098@nyu.edu,moodsphere,gymflow nyu,strideai,Yes - First choice
28,asm8879@nyu.edu,camp,roomiehub,tickerpickr,Yes - First choice
115,sm10454@nyu.edu,nourishpet,gymflow nyu,kitchen helper,Yes - First choice
56,mmc10050@nyu.edu,tickerpickr,cardwise automations,pocketbudget,No - I did not select my own project proposal ...
63,mk9014@nyu.edu,fithub,tickerpickr,cash me if you can,No - I did not select my own project proposal ...
118,ahc9434@nyu.edu,project crux,musi,nourishpet,Yes - First choice
9,lst8401@nyu.edu,tickerpickr,nourishpet,fishdex,Yes - First choice


## New method

In [993]:
def get_project_votes(df_votes, project):
    """
    Returns a dataframe of all rows where the first, second, or third choices were the given project.
    Orders results according to vote priority.    

    """
    df = df_votes.copy()
    df['sort_priority'] = np.select(
        [
            # first choice and wrote it themselves
            (df['first'] == project) & (df['voted_for_self'] == 'Yes - First choice'),
            # first choice but didn't write it
            (df['first'] == project) & (df['voted_for_self'] != 'Yes - First choice'),
            # second choice and wrote it themselves
            (df['second'] == project) & (df['voted_for_self'] == 'Yes - Second choice'),
            # second choice but didn't write it
            (df['second'] == project) & (df['voted_for_self'] != 'Yes - Second choice'),
            # third choice and wrote it themselves
            (df['third'] == project) & (df['voted_for_self'] == 'Yes - Third choice'),
            # third choice but didn't write it
            (df['third'] == project) & (df['voted_for_self'] != 'Yes - Third choice')
        ],
        [6, 5, 4, 3, 2, 1],
        default=0
    )
    mask = (
        (df['first'] == project) |
        (df['second'] == project) |
        (df['third'] == project)
    )
    return df[mask].sort_values(by='sort_priority', ascending=False).reset_index(drop=True)

In [994]:
# a function to check whether a team is full
def is_team_full(df_assignments, project):
    """
    Check if the given project team in df_assignments has reached the maximum number of students.
    Returns True if the team is full, otherwise False.
    """
    return df_assignments[df_assignments['project'] == project].shape[0] >= STUDENTS_PER_TEAM


In [995]:
# Assign students to projects based on their votes and self-proposal status

assigned_emails = set(df_assignments['email'])
projects = vote_counts['project'].tolist()

for priority in range(6):
    for project in projects:
        if is_team_full(df_assignments, project):
            continue
        # Select students based on priority
        if priority == 0:
            mask = (
                (df_votes['first'] == project) &
                (df_votes['voted_for_self'] == 'Yes - First choice')
            )
        elif priority == 1:
            mask = (
                (df_votes['first'] == project) &
                (df_votes['voted_for_self'] != 'Yes - First choice')
            )
        elif priority == 2:
            mask = (
                (df_votes['second'] == project) &
                (df_votes['voted_for_self'] == 'Yes - Second choice')
            )
        elif priority == 3:
            mask = (
                (df_votes['second'] == project) &
                (df_votes['voted_for_self'] != 'Yes - Second choice')
            )
        elif priority == 4:
            mask = (
                (df_votes['third'] == project) &
                (df_votes['voted_for_self'] == 'Yes - Third choice')
            )
        elif priority == 5:
            mask = (
                (df_votes['third'] == project) &
                (df_votes['voted_for_self'] != 'Yes - Third choice')
            )
        candidates = df_votes[mask & (~df_votes['email'].isin(assigned_emails))]
        for _, row in candidates.iterrows():
            if is_team_full(df_assignments, project):
                break
            df_assignments.loc[len(df_assignments)] = {
                'email': row['email'],
                'project': project,
                'own': (
                    (priority == 0) or
                    (priority == 2) or
                    (priority == 4)
                )
            }
            assigned_emails.add(row['email'])

In [996]:

# Function to redistribute remaining votes
df_assignments.sort_values(by=['project', 'own', 'email'])

Unnamed: 0,email,project,own
33,bb3621@nyu.edu,agrilink,True
31,bg2696@nyu.edu,agrilink,True
32,gmo6996@nyu.edu,agrilink,True
44,ak10747@nyu.edu,camp,True
43,asm8879@nyu.edu,camp,True
...,...,...,...
60,zs1329@nyu.edu,toberead,True
63,am13367@nyu.edu,upstore,True
61,so2426@nyu.edu,upstore,True
62,ss17886@nyu.edu,upstore,True


## Eliminate teams with too few members

All teams by number of members

In [997]:
team_counts = df_assignments['project'].value_counts().sort_values(ascending=False)
print(team_counts)

project
swapbay                      5
musi                         5
cash me if you can           5
taskbank                     5
moodsphere                   5
gesturetalk                  5
roomiehub                    5
nourishpet                   5
camp                         5
tickerpickr                  5
commyounity                  5
nutrismart                   4
instaskill                   4
rendezvous                   4
next quad                    4
strideai                     4
profpick                     4
tasklatte                    3
upstore                      3
study space finder @ nyu     3
cardwise automations         3
pocketbudget                 3
toberead                     3
einstein                     3
roulette go                  3
agrilink                     3
cartranker                   3
dungeons and distractions    2
the language café            2
project crux                 2
rirf                         1
supermix                     1


Too few votes

In [998]:
insufficient_projects = team_counts[team_counts < STUDENTS_PER_TEAM]
insufficient_projects

project
nutrismart                   4
instaskill                   4
rendezvous                   4
next quad                    4
strideai                     4
profpick                     4
tasklatte                    3
upstore                      3
study space finder @ nyu     3
cardwise automations         3
pocketbudget                 3
toberead                     3
einstein                     3
roulette go                  3
agrilink                     3
cartranker                   3
dungeons and distractions    2
the language café            2
project crux                 2
rirf                         1
supermix                     1
dinewise                     1
vodtracker                   1
Name: count, dtype: int64

In [999]:
# Replace project names in df_assignments that are in insufficient_projects with 'unassigned'
df_assignments.loc[df_assignments['project'].isin(insufficient_projects.index), 'project'] = np.nan
df_assignments

Unnamed: 0,email,project,own
0,nb3733@nyu.edu,,True
1,dhh5829@nyu.edu,,True
2,cb5330@nyu.edu,,True
3,ns5745@nyu.edu,,True
4,zs2838@nyu.edu,,True
...,...,...,...
114,nu2061@nyu.edu,,False
115,jl12397@nyu.edu,musi,False
116,yl10139@nyu.edu,roomiehub,False
117,yd2833@nyu.edu,cash me if you can,False


In [1000]:
team_counts = df_assignments['project'].value_counts().sort_values(ascending=False)
print(team_counts)

project
commyounity           5
moodsphere            5
gesturetalk           5
nourishpet            5
tickerpickr           5
swapbay               5
camp                  5
roomiehub             5
taskbank              5
cash me if you can    5
musi                  5
Name: count, dtype: int64


In [1001]:
# Find rows with null project
null_project_mask = df_assignments['project'].isnull()
num_nulls = null_project_mask.sum()
num_nulls

np.int64(64)

In [1002]:
import math

num_groups = math.ceil(num_nulls / STUDENTS_PER_TEAM)
num_groups

13

Generate random team names for those unassigned so far

In [1003]:
import nltk
import random

nltk.download('words')
from nltk.corpus import words

word_list = [w for w in words.words() if len(w) > 4]
random_names = [
    "random_" + random.choice(word_list).lower()
    for _ in range(num_groups)
]
random_names

[nltk_data] Downloading package words to
[nltk_data]     /Users/foobarstein/nltk_data...
[nltk_data]   Package words is already up-to-date!


['random_sydneian',
 'random_landlock',
 'random_caelian',
 'random_alumiferous',
 'random_planterdom',
 'random_grandeeism',
 'random_hairdo',
 'random_blottesque',
 'random_unkinger',
 'random_perplication',
 'random_truthteller',
 'random_unruinable',
 'random_plumbojarosite']

Assign the random team names to those students who are unassigned.

In [1004]:
# Assign random project names in groups
null_indices = df_assignments[null_project_mask].index.tolist()
for i, idx in enumerate(null_indices):
    group_num = i // STUDENTS_PER_TEAM
    df_assignments.at[idx, 'project'] = random_names[group_num]
df_assignments

Unnamed: 0,email,project,own
0,nb3733@nyu.edu,random_sydneian,True
1,dhh5829@nyu.edu,random_sydneian,True
2,cb5330@nyu.edu,random_sydneian,True
3,ns5745@nyu.edu,random_sydneian,True
4,zs2838@nyu.edu,random_sydneian,True
...,...,...,...
114,nu2061@nyu.edu,random_plumbojarosite,False
115,jl12397@nyu.edu,musi,False
116,yl10139@nyu.edu,roomiehub,False
117,yd2833@nyu.edu,cash me if you can,False


## Save assignments
Save to CSV data file.

In [1005]:
df_assignments = df_assignments.sort_values(by=['project', 'own', 'email'], ascending=[True, False, True])
df_assignments.to_csv(OUTPUT_FILE, index=False)