Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Edmond/llsc 27 Data Seeding (Matching) #9

Draft
wants to merge 9 commits into
base: rohan-edmond-mayank/matching-algo
Choose a base branch
from
117 changes: 116 additions & 1 deletion backend/matching/data/config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,122 @@
# config.py

# Constants for output format choices.
# Single definition only: an earlier four-item assignment without "db" was
# immediately overwritten by this one and has been dropped.
OUTPUT_FORMAT_CHOICES = ["dataframe", "csv", "json", "excel", "db"]

# Constants for formats that require a file path
FILE_PATH_REQUIRED_FORMATS = ["csv", "json", "excel"]

# Available data options
OPTIONS_FOR_DATA = ["participant", "volunteer", "matching"]

# Constants for the different types of data that can be generated

############################################################################################################
### DEMOGRAPHICS DATA
############################################################################################################
# Answer options shared by every yes/no question.
YES_NO = ["Yes", "No"]

# Small fixed pools of placeholder names.
FIRST_NAMES = [
    "John",
    "Jane",
    "Alex",
    "Taylor",
    "Sam",
    "Chris",
]
LAST_NAMES = [
    "Doe",
    "Smith",
    "Lee",
    "Patel",
    "Brown",
    "Garcia",
]

# Canadian provinces and territories, in alphabetical order.
PROVINCES = [
    "Alberta",
    "British Columbia",
    "Manitoba",
    "New Brunswick",
    "Newfoundland and Labrador",
    "Northwest Territories",
    "Nova Scotia",
    "Nunavut",
    "Ontario",
    "Prince Edward Island",
    "Quebec",
    "Saskatchewan",
    "Yukon",
]

LANGUAGES = [
    "English",
    "French",
]

GENDER_IDENTITIES = [
    "Female",
    "Male",
    "Non-binary",
    "Prefer not to answer",
    # TODO: not sure how to accommodate these free-text 'other' options
    "Prefer to self-describe",
]

PRONOUNS = [
    "she/her",
    "he/him",
    "they/them",
    "other",
]

ETHNIC_GROUPS = [
    "Black (including African and Caribbean descent)",
    "Middle Eastern, Western or Central Asian",
    "Chinese",
    "East Asian, excluding Chinese",
    "Indigenous person from Canada",
    "Latin American",
    "South Asian",
    "Southeast Asian",
    "White/Caucasian",
    "Mixed ethnicity",
    "Prefer not to answer",
]

MARITAL_STATUSES = [
    "Single",
    "Married/Common Law",
    "Divorced",
    "Widowed",
]


############################################################################################################
### MEDICAL INFORMATION DATA
############################################################################################################
# Relationship options for the caregiver question.
CAREGIVING_TYPES = [
    "Parent",
    "Sibling",
    "Child",
    "Spouse",
    "Friend",
    "Other",  # TODO: how do we impl this? (like if the user chooses other)
]

# Diagnoses grouped by category; each key maps to the diagnoses in that group.
DIAGNOSES = {
    "Unknown": ["Unknown"],
    "Category 1": [
        "Acute Myeloid Leukemia",
        "Acute Lymphoblastic Leukemia",
        "Acute Promyelocytic leukemia",
        "Mixed Phenotype Leukemia",
    ],
    "Category 2": [
        "Chronic Lymphocytic Leukemia / Small Lymphocytic Lymphoma",
        "Chronic Myeloid Leukemia",
        "Hairy Cell Leukemia",
    ],
    "Category 3": [
        "Myeloma",
        # Spelling corrected from "Hodgin’s" so it agrees with the two
        # "Non-Hodgkin’s" entries below.
        "Hodgkin’s Lymphoma",
        "Indolent/low grade Non-Hodgkin’s Lymphoma",
        "Aggressive/high grade Non-Hodgkin’s Lymphoma",
    ],
    "Category 4": ["Low risk MDS", "High Risk MDS"],
    "Category 5": ["Myelofibrosis", "Essential Thrombocythemia", "Polycythemia Vera"],
}

# Treatment options.
TREATMENTS = [
    "Unknown",
    "Watch and Wait / Active Surveillance",
    "Chemotherapy/immunotherapy",
    "Oral Chemotherapy",
    "Radiation",
    "Maintenance chemotherapy",
    "Palliative care",
    "Transfusions",
    "Autologous Stem Cell Transplant",
    "Allogeneic Stem cell Transplant",
    "Haplo Stem Cell Transplant",
    "CAR-T",
]

# Experience options; a respondent may select several of these.
EXPERIENCES = [
    "Brain Fog",
    "Fatigue",
    "Fertility Issues",
    "Graft vs Host",
    "Returning to work after/during treatment",
    "Returning to school after/during treatment",
    "Speaking to your children about diagnosis",
    "Speaking to your family or friends about diagnosis",
    "Relapse",
    "Anxiety",
    "Depression",
    "PTSD",
    "Side effects from treatment",
]
149 changes: 98 additions & 51 deletions backend/matching/data/data_category/demographics.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,114 @@
import random
import datetime
import re
from faker import Faker
from backend.matching.data.config import (
PROVINCES,
LANGUAGES,
GENDER_IDENTITIES,
PRONOUNS,
ETHNIC_GROUPS,
MARITAL_STATUSES,
YES_NO,
)

# Module-level Faker instances, created once at import time and shared by the
# Demographics generators defined in this module.
fake = Faker() # generic faker
fake_ca = Faker("en_CA") # canadian faker


# TODO: this uses the random and datetime; we can use a seeder like Faker to generate more realistic data
# TODO: highlight the fields that are relevant to the matching algorithm
class Demographics:
    """Random generators for the demographic answers of a seeded survey response.

    Every method is a static method that returns one randomly generated
    answer. Option lists come from the constants imported from the config
    module; names, e-mail addresses, phone numbers and cities come from the
    module-level Faker instances ``fake`` and ``fake_ca``.
    """

    # TODO: Add more roles and diagnoses (as we go and finalize the survey)

    @staticmethod
    def get_random_first_name():
        """Return a random first name from the generic Faker instance."""
        return fake.first_name()

    @staticmethod
    def get_random_last_name():
        """Return a random last name from the generic Faker instance."""
        return fake.last_name()

    @staticmethod
    def _years_ago(day, years):
        """Return ``day`` moved back by ``years`` whole years.

        Feb 29 has no counterpart in a non-leap year, so in that case the
        result falls back to Feb 28 instead of raising ``ValueError``.
        """
        try:
            return day.replace(year=day.year - years)
        except ValueError:
            return day.replace(year=day.year - years, day=28)

    @staticmethod
    def get_random_date_of_birth(min_age=18, max_age=90):
        """Return a date of birth as ``YYYY-MM-DD`` for a random age.

        The age is drawn uniformly from ``min_age``..``max_age`` (inclusive)
        and the date is today's month and day that many years ago.
        """
        today = datetime.date.today()
        age = random.randint(min_age, max_age)
        # A bare today.replace(year=...) raises on Feb 29; the helper handles it.
        dob = Demographics._years_ago(today, age)
        return dob.strftime("%Y-%m-%d")

    # TODO: not needed for matching algorithm
    @staticmethod
    def get_random_email():
        """Return a random e-mail address from the generic Faker instance."""
        return fake.email()

    # TODO: not needed for matching algorithm
    @staticmethod
    def get_random_phone():
        """Return a random phone number from the ``en_CA`` Faker instance."""
        return fake_ca.phone_number()

    # TODO: not needed for matching algorithm
    @staticmethod
    def get_random_postal_code():
        """Return a random string shaped like ``A0A 0A0``.

        Each letter and digit of the template is replaced independently; the
        space is kept. The first letter is drawn from a narrower alphabet
        than the other two letters.
        """

        def pick(match):
            # Position 0 is the leading letter, which uses its own alphabet.
            if match.start() == 0:
                return random.choice("ABCEGHJKLMNPRSTVXY")
            if match.group().isalpha():
                return random.choice("ABCEGHJKLMNPRSTVWXYZ")
            return random.choice("0123456789")

        return re.sub(r"[A-Z]|\d", pick, "A0A 0A0")

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_province():
        """Return one entry of ``PROVINCES`` chosen uniformly at random."""
        # TODO: could probably use the Canadian Faker data for provinces too
        return random.choice(PROVINCES)

    @staticmethod
    def get_random_city():
        """Return a random city from the ``en_CA`` Faker instance."""
        return fake_ca.city()

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_language():
        """Return one entry of ``LANGUAGES`` chosen uniformly at random."""
        return random.choice(LANGUAGES)

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_gender_identity():
        """Return one entry of ``GENDER_IDENTITIES`` chosen uniformly at random."""
        return random.choice(GENDER_IDENTITIES)

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_pronouns():
        """Return one entry of ``PRONOUNS`` chosen uniformly at random."""
        return random.choice(PRONOUNS)

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_ethnic_background():
        """Return one entry of ``ETHNIC_GROUPS`` chosen uniformly at random."""
        return random.choice(ETHNIC_GROUPS)

    # Retained from the previous revision so that existing callers, if any,
    # keep working.
    @staticmethod
    def get_random_age(min_age=18, max_age=90):
        """Return a random integer age in ``min_age``..``max_age`` (inclusive)."""
        return random.randint(min_age, max_age)

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_marital_status():
        """Return one entry of ``MARITAL_STATUSES`` chosen uniformly at random."""
        return random.choice(MARITAL_STATUSES)

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_children_status():
        """Return ``"Yes"`` or ``"No"`` at random."""
        return random.choice(YES_NO)

    #### FOR THE VOLUNTEER QUESTIONS
    @staticmethod
    def get_criminal_record_status():
        """Return ``"Yes"`` or ``"No"`` at random."""
        return random.choice(YES_NO)
55 changes: 55 additions & 0 deletions backend/matching/data/data_category/medical_information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import random
from datetime import datetime, timedelta
from backend.matching.data.config import (
DIAGNOSES,
TREATMENTS,
EXPERIENCES,
YES_NO,
CAREGIVING_TYPES,
)


class MedicalInformation:
    """Random generators for the medical-information answers of a seeded response.

    Every method is a static method that returns one randomly generated
    answer. Option lists come from the constants imported from the config
    module.
    """

    # TODO: Add more roles and diagnoses (as we go and finalize the survey)

    @staticmethod
    def get_random_blood_cancer_question():
        """Return ``"Yes"`` or ``"No"`` at random."""
        return random.choice(YES_NO)

    @staticmethod
    def get_random_caregiver_question():
        """Return ``"Yes"`` or ``"No"`` at random."""
        return random.choice(YES_NO)

    @staticmethod
    def get_random_caregiver_type():
        """Return one entry of ``CAREGIVING_TYPES`` chosen uniformly at random."""
        return random.choice(CAREGIVING_TYPES)

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_diagnosis():
        """Return a random diagnosis name.

        A category is chosen uniformly first, then a diagnosis within it, so
        diagnoses in small categories are more likely than those in large ones.
        """
        category = random.choice(list(DIAGNOSES))
        return random.choice(DIAGNOSES[category])

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_date_of_diagnosis(start_year=1969, end_year=None):
        """Return a random diagnosis date formatted as ``"<Month name> <Year>"``.

        Args:
            start_year: First calendar year of the range (from Jan 1).
            end_year: Last calendar year of the range. ``None`` (the default)
                means the current year, resolved on every call rather than
                once at import time.

        The end of the range is capped at the current moment, so the result
        never lies in the future.

        Raises:
            ValueError: If the range is empty (``start_year`` after the
                capped end of the range).
        """
        now = datetime.now()
        if end_year is None:
            end_year = now.year
        start_date = datetime(start_year, 1, 1)
        # Without the cap, the tail of the current year (or any later year)
        # would produce diagnosis dates that have not happened yet.
        end_date = min(datetime(end_year, 12, 31), now)
        span_days = (end_date - start_date).days
        if span_days < 0:
            raise ValueError(
                f"empty date range: start_year={start_year}, end_year={end_year}"
            )
        random_date = start_date + timedelta(days=random.randint(0, span_days))
        return random_date.strftime("%B %Y")

    # IMPORTANT: for matching algo
    @staticmethod
    def get_random_treatment():
        """Return one entry of ``TREATMENTS`` chosen uniformly at random."""
        return random.choice(TREATMENTS)

    # IMPORTANT: for matching algo
    # there can be multiple experiences that they can select
    @staticmethod
    def get_random_experience():
        """Return a random list of distinct entries from ``EXPERIENCES``.

        The list length is uniform between 0 and ``len(EXPERIENCES)``, so the
        result ranges from an empty list to every experience.
        """
        num_experiences = random.randint(0, len(EXPERIENCES))
        return random.sample(EXPERIENCES, num_experiences)
48 changes: 0 additions & 48 deletions backend/matching/data/data_category/personality.py

This file was deleted.

Loading