You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
privacykit/notebooks/registered-tier.ipynb

33 KiB

None <html lang="en"> <head> </head>
In [9]:
"""
This notebook is designed to run experiments around demographics on the registered tier.
"""
import os

import pandas as pd
import numpy as np
from pandas_risk import *

# Attribute names for the registered-tier experiments.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
ATTRIBUTES = [
    'race', 'ethnicity', 'birth_date', 'state', 'city', 'zip',
    'marital_status', 'education', 'language', 'home_owner', 'income',
    'employment_status', 'living_situation', 'active_duty_status',
    'gender_identity', 'birth_place', 'death_date', 'death_cause',
    'orientation',
]

# Location of the service-account key used for BigQuery. Set the
# GOOGLE_APPLICATION_CREDENTIALS environment variable to run the notebook on
# another machine; the original developer path is kept only as the fallback.
KEY_PATH = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',
)

# Scenario settings: one row per feature with its `fi` / `fo` flags.
dfs = pd.read_csv('scenario-settings.csv')

# Full registered-tier table pulled from BigQuery.
# NOTE(review): `private_key` is deprecated in newer pandas / pandas-gbq in
# favour of `credentials`; kept here because switching requires a
# google-auth import this notebook does not currently have.
dfc = pd.read_gbq("SELECT * FROM deid_risk.registered_dec_01", private_key=KEY_PATH)
In [10]:
# Date fields are dropped from every column set because dates are shifted.
DATE_FIELDS = ('birth_date', 'death_date')


def _without_dates(fields):
    """Return `fields` as a list, order preserved, minus the names in DATE_FIELDS."""
    return [name for name in fields if name not in DATE_FIELDS]


# Row masks over the scenario settings (the `fi` / `fo` columns hold 0/1 flags).
in_both = (dfs.fo & dfs.fi) == 1
in_either = (dfs.fo + dfs.fi) >= 1

cols_o = _without_dates(dfs.loc[in_both].feature.tolist())
cols_i = _without_dates(dfs.loc[in_either].feature.tolist())
cols_a = _without_dates(dfs.feature.tolist())

#-- voter registration
cols_v = _without_dates(['birth_date', 'gender_identity', 'race', 'state', 'city', 'birth_place'])
In [11]:
# print(dfs)
# print(cols_o)
# print(cols_i)
In [12]:
# Each scenario pairs its label with the column subset evaluated for it.
# Keeping both in one table guarantees the labels line up with the rows.
scenarios = [
    ('high-conj', cols_o),
    ('high-disj', cols_i),
    ('all', cols_a),
    ('voter-reg', cols_v),
]

# Evaluate every column subset through the `deid` accessor and stack the results.
# Assumes each evaluate() call yields exactly one row, as the four-label
# assignment below requires.
r = pd.concat([dfc[fields].deid.evaluate() for label, fields in scenarios])
r.index = np.arange(len(r), dtype=np.int64)
r['flag'] = [label for label, fields in scenarios]

r
Out[12]:
field_count flag group_count marketer prosecutor unique_row_ratio
0 8 high-conj 6532 0.056234 1.0 0.021368
1 11 high-disj 47447 0.408473 1.0 0.278554
2 16 all 60718 0.522724 1.0 0.408189
3 5 voter-reg 1316 0.011329 1.0 0.002944
In [14]:
# Bar chart of the 'marketer' column, one bar per scenario flag; the Figure
# handle is kept in fig_o.
marketer_axes = r.plot(kind='bar', x='flag', y=['marketer'])
fig_o = marketer_axes.get_figure()
No description has been provided for this image
In [15]:
# Export the scenario summary `r` to the 'phase-1' sheet of an Excel workbook.
# The context manager closes (and thereby writes) the workbook on exit, also
# when to_excel raises, and replaces the explicit writer.save() call, which
# was removed in pandas 2.0. The sheet name is passed by keyword because
# newer pandas no longer accepts it positionally.
with pd.ExcelWriter('out-116kpatients-phase-1.xlsx', engine='xlsxwriter') as writer:
    r.to_excel(writer, sheet_name='phase-1')
In [19]:
# Display the scenario settings table (feature, fi, fo) read from 'scenario-settings.csv'.
dfs
Out[19]:
feature fi fo
0 race 1 1
1 ethnicity 1 1
2 birth_date 1 1
3 city 1 1
4 state 1 1
5 marital_status 1 1
6 education 1 0
7 language 0 0
8 home_owner 1 1
9 income 0 1
10 employment_status 1 0
11 living_situation 0 0
12 active_duty_status 0 0
13 gender_identity 1 1
14 birth_place 0 0
15 death_date 1 1
16 death_cause 1 1
17 orientation 0 0
In [38]:
import os

import pandas as pd
import numpy as np

# Values of the `name` column of the family-history CSV.
# NOTE(review): `names` is not used by any other cell visible here.
names = pd.read_csv('family-history.csv').name.tolist()

# Service-account key for BigQuery. GOOGLE_APPLICATION_CREDENTIALS overrides
# the original developer path, which is kept only as the fallback.
path  = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',
)

sql   = """
SELECT * FROM deid_risk.registered_medical_history_dec_001
"""

# Run the query held in `sql` instead of a duplicated string literal, so the
# two can no longer drift apart. strip() removes the surrounding newlines so
# the text sent to BigQuery is the same as before.
# NOTE(review): `private_key` is deprecated in newer pandas / pandas-gbq in
# favour of `credentials`.
dfm = pd.read_gbq(sql.strip(), private_key=path, dialect='standard')
In [69]:
# Every column except the person identifier, in the frame's own column order.
# (A set difference would return the columns in arbitrary order, making the
# row order of `r` and of any export differ between runs.)
cols = [column for column in dfm.columns.tolist() if column != 'person_id']

# Non-null count per column, and that count as a percentage of all rows.
# NOTE(review): this rebinds `r`, which an earlier cell uses for the scenario summary.
r = dfm[cols].count().to_frame('counts')
r['attributes'] = r.index
r['rate'] = 100 * (r.counts / dfm.shape[0])

# Mean and sample standard deviation of the fill rate; std() uses the same
# ddof=1 as var(), so it equals the square root of var().
r.rate.mean(), r.rate.std()
Out[69]:
(0.9343780009344719, 1.269831148073964)
In [81]:
# Scratch cell exploring the xlsxwriter engine behind pandas' ExcelWriter.
# NOTE(review): the writer is never saved or closed in the cells visible here,
# so the workbook may never reach disk; the absolute /home/steve path is also
# machine-specific.
writer = pd.ExcelWriter('/home/steve/tmp/simple.xlsx', engine='xlsxwriter')
# Write the current `r` frame to sheet 'p1'.
r.to_excel(writer,sheet_name='p1')
# Underlying workbook object of the writer, plus an extra sheet with a default name.
workbook  = writer.book
worksheet = workbook.add_worksheet()
# 30 random draws from range(10); no seed is set, so values differ on every run.
# NOTE(review): `b` is not used by any other cell visible here.
b = pd.DataFrame({"id":np.random.choice(10,30)})
In [80]:
# Exploratory introspection: list every attribute of the worksheet object
# created with workbook.add_worksheet().
# NOTE(review): leftover debugging with a very long output — consider removing before sharing.
dir(worksheet)
Out[80]:
['__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_assemble_xml_file',
 '_button_params',
 '_calculate_spans',
 '_calculate_x_split_width',
 '_check_dimensions',
 '_comment_params',
 '_convert_date_time',
 '_convert_name_area',
 '_csv_join',
 '_encode_password',
 '_escape_attributes',
 '_escape_data',
 '_escape_url',
 '_extract_filter_tokens',
 '_get_palette_color',
 '_get_range_data',
 '_initialize',
 '_isinf',
 '_isnan',
 '_opt_close',
 '_opt_reopen',
 '_parse_filter_expression',
 '_parse_filter_tokens',
 '_position_object_emus',
 '_position_object_pixels',
 '_prepare_chart',
 '_prepare_header_image',
 '_prepare_header_vml_objects',
 '_prepare_image',
 '_prepare_shape',
 '_prepare_tables',
 '_prepare_vml_objects',
 '_set_filehandle',
 '_set_icon_props',
 '_set_spark_color',
 '_set_xml_writer',
 '_size_col',
 '_size_row',
 '_sort_pagebreaks',
 '_table_function_to_formula',
 '_write',
 '_write_array_formula',
 '_write_auto_filter',
 '_write_autofilters',
 '_write_blank',
 '_write_boolean',
 '_write_brk',
 '_write_cell',
 '_write_cell_array_formula',
 '_write_cell_value',
 '_write_cf_rule',
 '_write_cfvo',
 '_write_col_breaks',
 '_write_col_info',
 '_write_color',
 '_write_color_axis',
 '_write_color_first',
 '_write_color_high',
 '_write_color_last',
 '_write_color_low',
 '_write_color_markers',
 '_write_color_negative',
 '_write_color_scale',
 '_write_color_series',
 '_write_cols',
 '_write_conditional_formats',
 '_write_conditional_formatting',
 '_write_conditional_formatting_2010',
 '_write_custom_filter',
 '_write_custom_filters',
 '_write_data_bar',
 '_write_data_bar_ext',
 '_write_data_validation',
 '_write_data_validations',
 '_write_datetime',
 '_write_dimension',
 '_write_drawing',
 '_write_drawings',
 '_write_empty_row',
 '_write_ext',
 '_write_ext_list',
 '_write_ext_list_data_bars',
 '_write_ext_list_sparklines',
 '_write_filter',
 '_write_filter_column',
 '_write_filters',
 '_write_font',
 '_write_formula',
 '_write_formula_1',
 '_write_formula_2',
 '_write_formula_element',
 '_write_freeze_panes',
 '_write_header_footer',
 '_write_hyperlink_external',
 '_write_hyperlink_internal',
 '_write_hyperlinks',
 '_write_icon_set',
 '_write_legacy_drawing',
 '_write_legacy_drawing_hf',
 '_write_merge_cell',
 '_write_merge_cells',
 '_write_number',
 '_write_odd_footer',
 '_write_odd_header',
 '_write_optimized_sheet_data',
 '_write_outline_pr',
 '_write_page_margins',
 '_write_page_set_up_pr',
 '_write_page_setup',
 '_write_panes',
 '_write_phonetic_pr',
 '_write_print_options',
 '_write_rich_string',
 '_write_row',
 '_write_row_breaks',
 '_write_rows',
 '_write_rstring_color',
 '_write_selection',
 '_write_selections',
 '_write_sheet_data',
 '_write_sheet_format_pr',
 '_write_sheet_pr',
 '_write_sheet_protection',
 '_write_sheet_view',
 '_write_sheet_views',
 '_write_single_row',
 '_write_spark_color',
 '_write_sparkline_group',
 '_write_sparkline_groups',
 '_write_sparklines',
 '_write_split_panes',
 '_write_string',
 '_write_tab_color',
 '_write_table_part',
 '_write_table_parts',
 '_write_token_as_string',
 '_write_underline',
 '_write_url',
 '_write_vert_align',
 '_write_worksheet',
 '_write_x14_axis_color',
 '_write_x14_border_color',
 '_write_x14_cf_rule',
 '_write_x14_cfvo',
 '_write_x14_data_bar',
 '_write_x14_negative_border_color',
 '_write_x14_negative_fill_color',
 '_xml_close',
 '_xml_data_element',
 '_xml_declaration',
 '_xml_empty_tag',
 '_xml_empty_tag_unencoded',
 '_xml_end_tag',
 '_xml_formula_element',
 '_xml_inline_string',
 '_xml_number_element',
 '_xml_rich_inline_string',
 '_xml_rich_si_element',
 '_xml_si_element',
 '_xml_start_tag',
 '_xml_start_tag_unencoded',
 '_xml_string_element',
 'activate',
 'active',
 'active_pane',
 'add_sparkline',
 'add_table',
 'autofilter',
 'autofilter_area',
 'autofilter_ref',
 'black_white',
 'buttons_list',
 'center_horizontally',
 'center_vertically',
 'charts',
 'col_formats',
 'col_size_changed',
 'col_sizes',
 'colinfo',
 'comments',
 'comments_author',
 'comments_list',
 'comments_visible',
 'cond_formats',
 'conditional_format',
 'constant_memory',
 'data_bars_2010',
 'data_validation',
 'date_1904',
 'default_col_pixels',
 'default_date_format',
 'default_row_height',
 'default_row_pixels',
 'default_row_zeroed',
 'default_url_format',
 'dim_colmax',
 'dim_colmin',
 'dim_rowmax',
 'dim_rowmin',
 'draft_quality',
 'drawing',
 'drawing_links',
 'dxf_priority',
 'escapes',
 'excel2003_style',
 'excel_version',
 'ext_sheets',
 'external_comment_links',
 'external_drawing_links',
 'external_hyper_links',
 'external_table_links',
 'external_vml_links',
 'fh',
 'fileclosed',
 'filter_cols',
 'filter_column',
 'filter_column_list',
 'filter_on',
 'filter_range',
 'filter_type',
 'fit_height',
 'fit_page',
 'fit_to_pages',
 'fit_width',
 'footer',
 'footer_images',
 'freeze_panes',
 'get_name',
 'has_comments',
 'has_header_vml',
 'has_vml',
 'hbreaks',
 'hcenter',
 'header',
 'header_footer_aligns',
 'header_footer_changed',
 'header_footer_scales',
 'header_images',
 'header_images_list',
 'hidden',
 'hide',
 'hide_gridlines',
 'hide_row_col_headers',
 'hide_zero',
 'hlink_count',
 'hlink_refs',
 'horizontal_dpi',
 'hyperlinks',
 'images',
 'index',
 'insert_button',
 'insert_chart',
 'insert_image',
 'insert_textbox',
 'internal_fh',
 'is_chartsheet',
 'is_right_to_left',
 'last_shape_id',
 'leading_zeros',
 'margin_bottom',
 'margin_footer',
 'margin_header',
 'margin_left',
 'margin_right',
 'margin_top',
 'merge',
 'merge_range',
 'name',
 'names',
 'nan_inf_to_errors',
 'orientation',
 'original_row_height',
 'outline_below',
 'outline_changed',
 'outline_col_level',
 'outline_on',
 'outline_right',
 'outline_row_level',
 'outline_settings',
 'outline_style',
 'page_order',
 'page_setup_changed',
 'page_start',
 'page_view',
 'palette',
 'panes',
 'paper_size',
 'previous_row',
 'print_across',
 'print_area',
 'print_area_range',
 'print_comments',
 'print_gridlines',
 'print_headers',
 'print_options_changed',
 'print_row_col_headers',
 'print_scale',
 'protect',
 'protect_options',
 'rel_count',
 'remove_timezone',
 'repeat_col_range',
 'repeat_columns',
 'repeat_row_range',
 'repeat_rows',
 'right_to_left',
 'row_col_headers',
 'row_data_fh',
 'row_data_fh_closed',
 'row_data_filename',
 'row_size_changed',
 'row_sizes',
 'row_spans',
 'rstring',
 'screen_gridlines',
 'select',
 'selected',
 'selections',
 'set_cols',
 'set_column',
 'set_comments_author',
 'set_default_row',
 'set_first_sheet',
 'set_footer',
 'set_h_pagebreaks',
 'set_header',
 'set_landscape',
 'set_margins',
 'set_page_view',
 'set_paper',
 'set_portrait',
 'set_print_scale',
 'set_row',
 'set_rows',
 'set_selection',
 'set_start_page',
 'set_tab_color',
 'set_v_pagebreaks',
 'set_vba_name',
 'set_zoom',
 'shape_hash',
 'shapes',
 'show_comments',
 'show_zeros',
 'sparklines',
 'split_panes',
 'str_table',
 'strings_to_formulas',
 'strings_to_numbers',
 'strings_to_urls',
 'tab_color',
 'table',
 'tables',
 'tmpdir',
 'use_data_bars_2010',
 'validations',
 'vba_codename',
 'vbreaks',
 'vcenter',
 'vertical_dpi',
 'vml_data_id',
 'vml_drawing_links',
 'vml_header_id',
 'vml_shape_id',
 'worksheet_meta',
 'write',
 'write_array_formula',
 'write_blank',
 'write_boolean',
 'write_column',
 'write_comment',
 'write_datetime',
 'write_formula',
 'write_match',
 'write_number',
 'write_rich_string',
 'write_row',
 'write_string',
 'write_url',
 'xls_colmax',
 'xls_rowmax',
 'xls_strmax',
 'zoom',
 'zoom_scale_normal']
In [ ]:

</html>