Skip to content
This repository has been archived by the owner on Sep 9, 2020. It is now read-only.

Commit

Permalink
Adding prediction explanations support to batch scoring. (#124)
Browse files Browse the repository at this point in the history
* Adding reason codes support to batch scoring.
* Renaming reason codes to prediction explanations
* Add documentation and update rate limiting.
* Renaming the output fields to be explanation.
* Removing sampling change and fixing the linter.
* Realigning the argument table.
* Renaming reason codes to prediction explanations
* Realigning the argument table
* Old api and prediction explanations incompatible
1. Moved argument to dataset group and updated helpstring
2. Added possibility to put argument into config file
3. Added check for api version and prediction explanations
incompatibility
4. Tests
* Update prediction explanation doc
Both CHANGES and Readme

* Bump version to 1.15.0
  • Loading branch information
doleks authored and pprett committed Nov 23, 2018
1 parent ba73d7a commit a5f2379
Show file tree
Hide file tree
Showing 16 changed files with 794 additions and 62 deletions.
6 changes: 5 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
1.14.3 (Unreleased)
1.15.0 (Unreleased)
===================

Enhancements
------------
* Added new argument ``--max_prediction_explanations`` that allows batch scoring with prediction explanations and adds ``explanation_N_feature`` and ``explanation_N_strength`` columns to each row in the output document (where ``1 ≤ N ≤ max_prediction_explanations``).

1.14.2 (2018 Nov 14)
=======================

Expand Down
82 changes: 42 additions & 40 deletions README.rst

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion datarobot_batch_scoring/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.14.2'
__version__ = '1.15.0'
17 changes: 17 additions & 0 deletions datarobot_batch_scoring/api_response_handlers/pred_api_v10.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import operator
import json
from six.moves import zip

from datarobot_batch_scoring.exceptions import UnexpectedKeptColumnCount

Expand Down Expand Up @@ -70,6 +71,22 @@ def format_data(result, batch, **opts):
written_fields = out_fields[1:]
comb = [row[1:] for row in pred]

if 'predictionExplanations' in single_row:
num_reason_codes = len(single_row['predictionExplanations'])
for num in range(1, num_reason_codes + 1):
written_fields += [
'explanation_{0}_feature'.format(num),
'explanation_{0}_strength'.format(num)
]
for in_row, out_row in zip(result, comb):
reason_codes = []
for raw_reason_code in in_row['predictionExplanations']:
reason_codes += [
raw_reason_code['feature'],
raw_reason_code['strength']
]
out_row.extend(reason_codes)

return written_fields, comb


Expand Down
19 changes: 15 additions & 4 deletions datarobot_batch_scoring/batch_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def run_batch_predictions(base_url, base_headers, user, pwd,
max_batch_size=None, compression=None,
field_size_limit=None,
verify_ssl=True,
deployment_id=None):
deployment_id=None,
max_prediction_explanations=0):

if field_size_limit is not None:
csv.field_size_limit(field_size_limit)
Expand Down Expand Up @@ -122,13 +123,23 @@ def run_batch_predictions(base_url, base_headers, user, pwd,
base_headers['content-type'] = 'text/csv; charset=utf8'
if compression:
base_headers['Content-Encoding'] = 'gzip'

if import_id:
endpoint = base_url + '/'.join((import_id, 'predict'))
endpoint = base_url + import_id
elif deployment_id is not None:
endpoint = base_url + '/'.join(
('deployments', deployment_id, 'predictions'))
('deployments', deployment_id))
else:
endpoint = base_url + '/'.join((pid, lid))

if max_prediction_explanations:
endpoint += '/predictionExplanations?maxCodes=' + \
str(max_prediction_explanations)
else:
endpoint = base_url + '/'.join((pid, lid, 'predict'))
if deployment_id is not None:
endpoint += '/predictions'
else:
endpoint += '/predict'

encoding = investigate_encoding_and_dialect(
dataset=dataset,
Expand Down
18 changes: 17 additions & 1 deletion datarobot_batch_scoring/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from datarobot_batch_scoring import __version__
from datarobot_batch_scoring.api_response_handlers import (
RESPONSE_HANDLERS, PRED_API_V10)
RESPONSE_HANDLERS, PRED_API_V10, API_V1)
from datarobot_batch_scoring.batch_scoring import (run_batch_predictions)
from datarobot_batch_scoring.exceptions import ShelveError
from datarobot_batch_scoring.utils import (UI, get_config_file,
Expand Down Expand Up @@ -60,6 +60,7 @@ def parse_args(argv, standalone=False, deployment_aware=False):
'stdout': False,
'auto_sample': False,
'api_version': PRED_API_V10,
'max_prediction_explanations': 0
}
parser = argparse.ArgumentParser(
description=DESCRIPTION, epilog=EPILOG,
Expand Down Expand Up @@ -131,6 +132,13 @@ def parse_args(argv, standalone=False, deployment_aware=False):
dataset_gr.add_argument('dataset', type=str,
help='Specifies the .csv input file that '
'the script scores.')
dataset_gr.add_argument('--max_prediction_explanations',
type=int,
default=defaults['max_prediction_explanations'],
help='The maximum number of prediction '
'explanations that will be generate for '
'each prediction.'
'Not compatible with api version `api/v1`')

conn_gr = parser.add_argument_group('Connection control')
conn_gr.add_argument('--timeout', type=int,
Expand Down Expand Up @@ -324,6 +332,12 @@ def parse_generic_options(parsed_args):
dataset = parsed_args['dataset']
if not os.path.exists(dataset):
ui.fatal('file {} does not exist.'.format(dataset))
api_version = parsed_args['api_version']
max_prediction_explanations = parsed_args['max_prediction_explanations']
if api_version == API_V1 and max_prediction_explanations > 0:
ui.fatal('Prediction explanation is not available for '
'api_version `api/v1` please use the '
'`predApi/v1.0` or deployments endpoint')

ui.debug('batch_scoring v{}'.format(__version__))

Expand All @@ -348,6 +362,8 @@ def parse_generic_options(parsed_args):
'skip_row_id': skip_row_id,
'timeout': timeout,
'verify_ssl': parsed_args['verify_ssl'],
'max_prediction_explanations':
parsed_args['max_prediction_explanations'],
}


Expand Down
1 change: 1 addition & 0 deletions datarobot_batch_scoring/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def verify_objectid(value):
OptKey('field_size_limit'): t.Int,
OptKey('ca_bundle'): t.String,
OptKey('no_verify_ssl'): t.StrBool,
OptKey('max_prediction_explanations'): t.Int,
}).allow_extra('*')


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# batch scoring script may be installed in pretty outdated envs. So let's do it
# old-fashioned way by adding condition here.
#
# [1] https://github.com/agronholm/pythonfutures/commit/d0393ad626d25622927bb0ed47d35ddb2f6cd321
# [1] https://github.com/agronholm/pythonfutures/commit/d0393ad626d25622927bb0ed47d35ddb2f6cd321 # noqa: E501
# [2] https://www.python.org/dev/peps/pep-0508/#environment-markers
if sys.version_info[0] > 2:
install_requires = [req
Expand Down
11 changes: 11 additions & 0 deletions tests/fixtures/10kDiabetes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
readmitted,rowID,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,diag_1_desc,diag_2_desc,diag_3_desc
FALSE,1,Caucasian,Female,[50-60),?,Elective,Discharged to home,Physician Referral,1,CP,Surgery-Neuro,35,4,21,0,0,0,723,723,719,9,None,None,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Spinal stenosis in cervical region,Spinal stenosis in cervical region,"Effusion of joint, site unspecified"
FALSE,2,Caucasian,Female,[20-30),[50-75),Urgent,Discharged to home,Physician Referral,2,UN,?,8,5,5,0,0,0,664,648,285,6,None,None,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,"First-degree perineal laceration, unspecified as to episode of care or not applicable","Diabetes mellitus of mother, complicating pregnancy, childbirth, or the puerperium, unspecified as to episode of care or not applicable",Sideroblastic anemia
TRUE,3,Caucasian,Male,[80-90),?,Not Available,Discharged/transferred to home with home health service,,7,MC,Family/GeneralPractice,12,0,21,0,0,1,481,428,276,9,>200,None,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,Pneumococcal pneumonia [Streptococcus pneumoniae pneumonia],"Congestive heart failure, unspecified",Hyperosmolality and/or hypernatremia
FALSE,4,AfricanAmerican,Female,[50-60),?,Emergency,Discharged to home,Transfer from another health care facility,4,UN,?,33,1,5,0,0,0,682,41,250,3,None,None,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,Cellulitis and abscess of face,"Streptococcus infection in conditions classified elsewhere and of unspecified site, streptococcus, unspecified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled"
FALSE,5,AfricanAmerican,Female,[50-60),?,Emergency,Discharged to home,Emergency Room,5,?,Psychiatry,31,0,13,0,0,0,296,250.01,298,7,None,None,Steady,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,"Bipolar I disorder, single manic episode, unspecified","Diabetes mellitus without mention of complication, type I [juvenile type], not stated as uncontrolled",Depressive type psychosis
FALSE,6,Caucasian,Male,[70-80),?,Elective,Discharged to home,Physician Referral,4,?,Cardiology,29,0,10,0,0,0,428,427,414,8,None,None,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,"Congestive heart failure, unspecified",Paroxysmal supraventricular tachycardia,"Coronary atherosclerosis of unspecified type of vessel, native or graft"
FALSE,7,Caucasian,Female,[60-70),?,Elective,Expired,Physician Referral,6,MC,InternalMedicine,46,1,20,0,0,0,434,345,584,8,None,None,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,Cerebral thrombosis without mention of cerebral infarction,"Generalized nonconvulsive epilepsy, without mention of intractable epilepsy",Acute kidney failure
FALSE,8,Caucasian,Female,[50-60),?,Emergency,Discharged to home,Emergency Room,2,?,?,49,1,17,2,1,1,558,562,455,9,None,Norm,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,Other and unspecified noninfectious gastroenteritis and colitis,Diverticulosis of small intestine (without mention of hemorrhage),Internal hemorrhoids without mention of complication
FALSE,9,Caucasian,Male,[50-60),?,,Discharged to home,,3,?,Family/GeneralPractice,54,0,10,0,0,1,428,425,70,9,None,None,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,"Congestive heart failure, unspecified",Endomyocardial fibrosis,Viral hepatitis A with hepatic coma
TRUE,10,Caucasian,Male,[60-70),?,Elective,Discharged to home,Physician Referral,5,?,Surgery-Cardiovascular/Thoracic,47,2,12,0,0,0,440,998,998,5,None,None,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,Atherosclerosis of aorta,"Postoperative shock, unspecified","Postoperative shock, unspecified"
11 changes: 11 additions & 0 deletions tests/fixtures/10kDiabetes_5explanations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
row_id,0,1,explanation_1_feature,explanation_1_strength,explanation_2_feature,explanation_2_strength,explanation_3_feature,explanation_3_strength,explanation_4_feature,explanation_4_strength,explanation_5_feature,explanation_5_strength
0,0.7374983461,0.2625016539,medical_specialty,-0.2230822974,number_diagnoses,0.2010719684,diag_1,-0.1882141621,number_inpatient,-0.1584939956,diag_3,-0.1538334979
1,0.7534670614,0.2465329386,weight,0.4616938769,diag_2,-0.2817732676,payer_code,-0.1999593278,age,-0.1700208122,num_lab_procedures,-0.1699786261
2,0.673941752,0.326058248,discharge_disposition_id,0.3047681596,number_inpatient,0.2266360201,medical_specialty,-0.2049645033,admission_source_id,-0.1710960504,num_lab_procedures,-0.1579545292
3,0.884682017,0.115317983,number_diagnoses,-0.4503520826,diag_2,-0.2808908801,admission_source_id,-0.2398130772,payer_code,-0.2143588931,number_inpatient,-0.1707111743
4,0.7116849878,0.2883150122,medical_specialty,-0.1814823805,race,-0.1575075199,number_inpatient,-0.1568230769,admission_source_id,0.1354087199,diag_3,-0.13507482
5,0.5909996414,0.4090003586,medical_specialty,-0.1884195522,diag_2,-0.1807624347,number_inpatient,-0.142271611,admission_type_id,0.1354998853,number_diagnoses,0.1320297742
6,0.8647944819,0.1352055181,discharge_disposition_id,-1.1295014348,medical_specialty,-0.2076687977,number_inpatient,-0.1461341051,payer_code,-0.1242433687,admission_type_id,0.0949499145
7,0.5144554583,0.4855445417,number_inpatient,0.2826505121,diag_1,-0.2274929185,number_emergency,0.2071603004,number_diagnoses,0.1620381118,discharge_disposition_id,0.1560600053
8,0.3746314486,0.6253685514,admission_type_id,0.5009928124,number_inpatient,0.2401329203,medical_specialty,-0.1651539909,admission_source_id,-0.1571784835,number_diagnoses,0.1377384773
9,0.7117755608,0.2882244392,medical_specialty,-0.3821848966,number_diagnoses,-0.273826215,diag_3,0.229366976,number_inpatient,-0.1604879998,admission_type_id,0.1231549458
Loading

0 comments on commit a5f2379

Please sign in to comment.