parent
31c158149f
commit
b1796de6fc
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"cells": [],
|
||||||
|
"metadata": {},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
@ -0,0 +1,8 @@
|
|||||||
|
# Image used to run the aou-gan code: Ubuntu + Python 3 tooling + the cloned repository.
FROM ubuntu

# Build-time only: keeps package configuration from waiting on interactive input.
ARG DEBIAN_FRONTEND=noninteractive

# update / upgrade / install share one layer so a cached "apt-get update" layer
# is never combined with a later install; "locales" was listed twice originally.
RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get install -y git python3-dev tmux locales python3-pip python3-numpy python3-pandas \
 && rm -rf /var/lib/apt/lists/*

# NOTE(review): the Python code imports tensorflow.contrib, which only exists in
# TensorFlow 1.x -- confirm whether this install needs a version pin.
RUN ["pip3","install","pandas-gbq","tensorflow"]

# WORKDIR creates the directory when it does not exist, so no separate mkdir is needed.
WORKDIR /usr/apps
RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/gan.git","aou-gan"]
|
File diff suppressed because one or more lines are too long
Binary file not shown.
@ -0,0 +1,12 @@
|
|||||||
|
{
  "type": "service_account",
  "project_id": "aou-res-curation-prod",
  "private_key_id": "ecbf77975c5b7b1f4d4b1680bf67a5e0fd11dfaf",
  "private_key": "REDACTED - revoke this key and load the replacement from a secret store, never from version control",
  "client_email": "aou-res-curation-prod@appspot.gserviceaccount.com",
  "client_id": "",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/aou-res-curation-prod%40appspot.gserviceaccount.com"
}
|
@ -0,0 +1,12 @@
|
|||||||
|
{
  "type": "service_account",
  "project_id": "aou-res-curation-test",
  "private_key_id": "be9cb7427212dea882379d125530f5339ba854a7",
  "private_key": "REDACTED - revoke this key and load the replacement from a secret store, never from version control",
  "client_email": "aou-res-curation-test@appspot.gserviceaccount.com",
  "client_id": "",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/aou-res-curation-test%40appspot.gserviceaccount.com"
}
|
@ -0,0 +1,12 @@
|
|||||||
|
{
  "type": "service_account",
  "project_id": "aou-res-curation-test",
  "private_key_id": "1ed8d298e4b5572e7556b2f079133ea04568396a",
  "private_key": "REDACTED - revoke this key and load the replacement from a secret store, never from version control",
  "client_email": "aou-res-curation-test@appspot.gserviceaccount.com",
  "client_id": "",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/aou-res-curation-test%40appspot.gserviceaccount.com"
}
|
|
@ -0,0 +1,24 @@
|
|||||||
|
-- Returns the rows of wgan_original.observation whose observation_source_value
-- contains 'ICD' (case-insensitive), with every column replaced by the `encoded`
-- value found in wgan_original_pseudo.map for (table='observation', field=<column>,
-- values=<column value compared as STRING>).
SELECT
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'observation_id'
        AND CAST(values AS STRING) = CAST(observation.observation_id AS STRING)) AS observation_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'person_id'
        AND CAST(values AS STRING) = CAST(observation.person_id AS STRING)) AS person_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'observation_concept_id'
        AND CAST(values AS STRING) = CAST(observation.observation_concept_id AS STRING)) AS observation_concept_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'observation_date'
        AND CAST(values AS STRING) = CAST(observation.observation_date AS STRING)) AS observation_date,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'observation_datetime'
        AND CAST(values AS STRING) = CAST(observation.observation_datetime AS STRING)) AS observation_datetime,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'observation_type_concept_id'
        AND CAST(values AS STRING) = CAST(observation.observation_type_concept_id AS STRING)) AS observation_type_concept_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'value_as_number'
        AND CAST(values AS STRING) = CAST(observation.value_as_number AS STRING)) AS value_as_number,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'value_as_string'
        AND CAST(values AS STRING) = CAST(observation.value_as_string AS STRING)) AS value_as_string,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'value_as_concept_id'
        AND CAST(values AS STRING) = CAST(observation.value_as_concept_id AS STRING)) AS value_as_concept_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'qualifier_concept_id'
        AND CAST(values AS STRING) = CAST(observation.qualifier_concept_id AS STRING)) AS qualifier_concept_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'unit_concept_id'
        AND CAST(values AS STRING) = CAST(observation.unit_concept_id AS STRING)) AS unit_concept_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'provider_id'
        AND CAST(values AS STRING) = CAST(observation.provider_id AS STRING)) AS provider_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'visit_occurrence_id'
        AND CAST(values AS STRING) = CAST(observation.visit_occurrence_id AS STRING)) AS visit_occurrence_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'observation_source_value'
        AND CAST(values AS STRING) = CAST(observation.observation_source_value AS STRING)) AS observation_source_value,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'observation_source_concept_id'
        AND CAST(values AS STRING) = CAST(observation.observation_source_concept_id AS STRING)) AS observation_source_concept_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'unit_source_value'
        AND CAST(values AS STRING) = CAST(observation.unit_source_value AS STRING)) AS unit_source_value,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'qualifier_source_value'
        AND CAST(values AS STRING) = CAST(observation.qualifier_source_value AS STRING)) AS qualifier_source_value,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'value_source_concept_id'
        AND CAST(values AS STRING) = CAST(observation.value_source_concept_id AS STRING)) AS value_source_concept_id,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'value_source_value'
        AND CAST(values AS STRING) = CAST(observation.value_source_value AS STRING)) AS value_source_value,
    (SELECT encoded FROM wgan_original_pseudo.map
      WHERE table='observation' AND field = 'questionnaire_response_id'
        AND CAST(values AS STRING) = CAST(observation.questionnaire_response_id AS STRING)) AS questionnaire_response_id
FROM wgan_original.observation
WHERE
    REGEXP_CONTAINS(UPPER(observation_source_value),'ICD')
|
|
@ -0,0 +1,546 @@
|
|||||||
|
"""
|
||||||
|
usage :
|
||||||
|
optional :
|
||||||
|
--num_gpu number of gpus to use will default to 1
|
||||||
|
--epoch steps per epoch default to 256
|
||||||
|
"""
|
||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.contrib.layers import l2_regularizer
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from params import SYS_ARGS
|
||||||
|
from bridge import Binary
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Process-wide environment read by CUDA / TensorFlow: device ordering,
# the visible device list (only "0") and the TensorFlow C++ log level.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': "0",
    'TF_CPP_MIN_LOG_LEVEL': '2',
})
|
||||||
|
|
||||||
|
# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256
|
||||||
|
# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu'])
|
||||||
|
# BATCHSIZE_PER_GPU = 2000
|
||||||
|
# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS
|
||||||
|
|
||||||
|
class void :
    """
    Empty attribute holder; instances are used as ad-hoc namespaces
    (GNet stores callables on them, e.g. object.layers.normalize and object.get.variables)
    """
|
||||||
|
class GNet :
    """
    This is the base class of the generative network functions, the details are implemented in the subclasses.
    An instance of this class is accessed as follows
        object.layers.normalize     applies the label-conditioned normalization (see normalize)
        object.get.variables        instantiates variables on cpu and returns a reference (tensor)
    """
    def __init__(self,**args):
        """
        :real           optional matrix of real data, its shape sets X_SPACE_SIZE, BATCHSIZE_PER_GPU and ROW_COUNT
        :label          optional matrix of labels, a 2-dimensional label sets NUM_LABELS to its number of columns
        :context        required, name of the sub-folder created under the train/output log folders
        :max_epochs     optional, defaults to 10
        :column_id      optional, stored as ATTRIBUTES['id']
        :column         optional, stored as ATTRIBUTES['synthetic']
        :logs           optional, root folder of the logs (defaults to 'logs')
        """
        self.layers = void()
        self.layers.normalize = self.normalize

        self.get = void()
        self.get.variables = self._variable_on_cpu

        self.NUM_GPUS = 1

        self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854
        self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE]
        self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis
        if 'label' in args and len(args['label'].shape) == 2 :
            self.NUM_LABELS = args['label'].shape[1]
        elif 'label' in args and len(args['label']) == 1 :
            # NOTE(review): this tests the number of rows, not the number of dimensions;
            # possibly len(args['label'].shape) == 1 was intended -- confirm before changing
            self.NUM_LABELS = args['label'].shape[0]
        else:
            self.NUM_LABELS = 8
        self.Z_DIM = 128 #self.X_SPACE_SIZE
        self.BATCHSIZE_PER_GPU = args['real'].shape[0] if 'real' in args else 256
        self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS
        self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000)
        self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs'])
        self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100
        self.CONTEXT = args['context']
        self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None}
        self._REAL = args['real'] if 'real' in args else None
        self._LABEL = args['label'] if 'label' in args else None

        self.init_logs(**args)

    def init_logs(self,**args):
        """
        This function creates the log folders <logs>/train/<context> and <logs>/output/<context>
        and stores their paths in self.train_dir and self.out_dir
        :logs   optional, root folder of the logs (defaults to 'logs')
        """
        self.log_dir = args['logs'] if 'logs' in args else 'logs'
        self.mkdir(self.log_dir)
        for key in ['train','output'] :
            self.mkdir(os.sep.join([self.log_dir,key]))
            self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT]))

        self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT])
        self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])

    def load_meta(self,column):
        """
        This function is designed to accommodate the uses of the sub-classes outside of a strict dependency model.
        Because prediction and training can happen independently
        Every key found in <out_dir>/meta-<column>.json is copied onto the instance; nothing happens if the file is missing
        :column name of the column the meta data was logged for
        """
        _name = os.sep.join([self.out_dir,'meta-'+column+'.json'])
        if os.path.exists(_name) :
            # the context manager closes the handle, the original left it open
            with open(_name) as f :
                attr = json.loads(f.read())
            for key in attr :
                value = attr[key]
                setattr(self,key,value)
            # CONTEXT may have been overwritten by the meta data, the folders must follow
            self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT])
            self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])

    def log_meta(self,**args) :
        """
        This function writes the parameters of the network to <out_dir>/meta-<column>.json
        The column is taken from SYS_ARGS['column'] when present (original behavior), otherwise from ATTRIBUTES['synthetic']
        :key    optional, name of an additional entry to log
        :value  optional, value of the additional entry (both key and value must be provided)
        """
        meta = {
            'CONTEXT':self.CONTEXT,
            'ATTRIBUTES':self.ATTRIBUTES,
            'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU,
            'Z_DIM':self.Z_DIM,
            "X_SPACE_SIZE":self.X_SPACE_SIZE,
            "D_STRUCTURE":self.D_STRUCTURE,
            "G_STRUCTURE":self.G_STRUCTURE,
            "NUM_GPUS":self.NUM_GPUS,
            "NUM_LABELS":self.NUM_LABELS,
            "MAX_EPOCHS":self.MAX_EPOCHS,
            "ROW_COUNT":self.ROW_COUNT
        }
        if args and 'key' in args and 'value' in args :
            key = args['key']
            value= args['value']
            meta[key] = value
        column = SYS_ARGS['column'] if 'column' in SYS_ARGS else self.ATTRIBUTES['synthetic']
        _name = os.sep.join([self.out_dir,'meta-'+column])
        # closing the file guarantees the meta data is flushed to disk before anybody reads it
        with open(_name+'.json','w') as f :
            f.write(json.dumps(meta))

    def mkdir (self,path):
        """
        This function creates a folder (and its missing parents) and does nothing if the folder already exists
        """
        os.makedirs(path,exist_ok=True)

    def normalize(self,**args):
        """
        This function will perform a normalization on a network layer, with an offset and a scale looked up per label
        The moments are computed over axis 0 when the class is named generator and over axis 1 otherwise
        inputs    input layer of the neural network
        name      suffix of the names of the offset/scale variables
        labels    labels (attributes not synthesized) by default None
        n_labels  number of labels default None
        NOTE: callers also pass a shift argument, it is accepted but not used
        """
        inputs = args['inputs']
        name = args['name']
        labels = None if 'labels' not in args else args['labels']
        n_labels= None if 'n_labels' not in args else args['n_labels']
        shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing
        mean, var = tf.nn.moments(inputs, shift, keep_dims=True)
        shape = inputs.shape[1].value
        offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name,
                                        initializer=tf.zeros_initializer)
        scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,
                                        initializer=tf.ones_initializer)

        offset = tf.nn.embedding_lookup(offset_m, labels)
        scale = tf.nn.embedding_lookup(scale_m, labels)
        result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8)
        return result

    def _variable_on_cpu(self,**args):
        """
        This function makes sure variables/tensors are not created on the GPU but rather on the CPU
        :name           name of the variable
        :shape          shape of the variable
        :initializer    optional initializer, by default None
        """
        name = args['name']
        shape = args['shape']
        initializer=None if 'initializer' not in args else args['initializer']
        with tf.device('/cpu:0') :
            cpu_var = tf.compat.v1.get_variable(name,shape,initializer= initializer)
        return cpu_var

    def average_gradients(self,tower_grads):
        """
        This function averages the gradients computed by every tower (gpu)
        :tower_grads    list (one entry per tower) of lists of (gradient, variable) pairs
        returns a list of (mean gradient, variable) pairs, the variable is taken from the first tower
        """
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = []
            for g, _ in grad_and_vars:
                expanded_g = tf.expand_dims(g, 0)
                grads.append(expanded_g)

            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_mean(grad, 0)

            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            average_grads.append(grad_and_var)
        return average_grads
|
||||||
|
|
||||||
|
|
||||||
|
class Generator (GNet):
    """
    This class generates candidate datasets. It aggregates a discriminator whose
    output on the candidates is used to compute the generator loss.
    """
    def __init__(self,**args):
        GNet.__init__(self,**args)
        self.discriminator = Discriminator(**args)

    def loss(self,**args):
        """
        This function computes the generator loss: minus the mean discriminator output on the
        candidates plus the regularization losses. The loss is added to the 'glosses' collection
        :fake   candidate data
        :label  labels associated with the candidates
        returns the loss twice (loss, loss)
        """
        candidates = args['fake']
        labels = args['label']
        score = self.discriminator.network(inputs=candidates, label=labels)
        penalties = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        g_loss = -tf.reduce_mean(score) + sum(penalties)
        tf.add_to_collection('glosses', g_loss)
        return g_loss, g_loss

    def load_meta(self, column):
        """
        Loads the meta data of a column for the generator and for its discriminator
        """
        super().load_meta(column)
        self.discriminator.load_meta(column)

    def network(self,**args) :
        """
        This function will build the network that will generate the synthetic candidates
        :inputs matrix of data fed to the first layer
        :dim    optional, number of rows of the first kernel (defaults to Z_DIM)
        :label  labels used by the normalization of every layer
        """
        out = args['inputs']
        in_dim = args['dim'] if 'dim' in args else self.Z_DIM
        labels = args['label']
        hidden = self.G_STRUCTURE[:-1]
        last = len(self.G_STRUCTURE) - 1

        with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)):
            # hidden layers: relu of the normalized projection, added back to the input
            for index, width in enumerate(hidden):
                weights = self.get.variables(name='W_' + str(index), shape=[in_dim, width])
                projected = tf.matmul(out, weights)
                normed = self.normalize(inputs=projected,shift=0, name='cbn' + str(index), labels=labels, n_labels=self.NUM_LABELS)
                out = out + tf.nn.relu(normed)
                in_dim = width
            #
            # Extra hidden layer, same shape as above with a tanh activation
            weights = self.get.variables(name='W_' + str(last), shape=[in_dim, self.G_STRUCTURE[-1]])
            projected = tf.matmul(out, weights)
            normed = self.normalize(inputs=projected, name='cbn' + str(last),
                                    labels=labels, n_labels=self.NUM_LABELS)
            out = out + tf.nn.tanh(normed)
            #
            # Output layer: sigmoid of an affine map onto X_SPACE_SIZE columns
            weights = self.get.variables(name='W_' + str(last+1), shape=[self.Z_DIM, self.X_SPACE_SIZE])
            offsets = self.get.variables(name='b_' + str(last+1), shape=[self.X_SPACE_SIZE])
            out = tf.nn.sigmoid(tf.add(tf.matmul(out, weights), offsets))
        return out
|
||||||
|
|
||||||
|
class Discriminator(GNet):
    """
    This class scores a dataset (real, candidate or a mix of both) with a single output per row
    """
    def __init__(self,**args):
        GNet.__init__(self,**args)

    def network(self,**args):
        """
        This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron)
        :inputs matrix of data to score
        :label  labels used by the normalization of every layer
        """
        x = args['inputs']
        label = args['label']
        # the debugging print statements that used to dump the symbolic tensors at every graph construction were removed
        with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)):
            for i, dim in enumerate(self.D_STRUCTURE[1:]):
                kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim])
                bias = self.get.variables(name='b_' + str(i), shape=[dim])
                x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias))
                x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS)
            # the index of the output layer is kept as is so the variable names remain those of existing checkpoints
            i = len(self.D_STRUCTURE)
            kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1])
            bias = self.get.variables(name='b_' + str(i), shape=[1])
            y = tf.add(tf.matmul(x, kernel), bias)
        return y

    def loss(self,**args) :
        """
        This function computes the loss of the discriminator: the difference of the mean outputs on
        fake and real data, plus 10 times a gradient penalty evaluated on random interpolations of
        real and fake rows, plus the regularization losses. The loss is added to the 'dlosses' collection
        :real   real data
        :fake   candidate data
        :label  labels associated with the data
        returns (w_distance, loss) where w_distance is the loss without the penalty and the regularization
        """
        real = args['real']
        fake = args['fake']
        label = args['label']
        epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1)

        x_hat = real + epsilon * (fake - real)
        y_hat_fake = self.network(inputs=fake, label=label)

        y_hat_real = self.network(inputs=real, label=label)
        y_hat = self.network(inputs=x_hat, label=label)

        grad = tf.gradients(y_hat, [x_hat])[0]
        slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1))
        gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2)
        all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake)
        loss = w_distance + 10 * gradient_penalty + sum(all_regs)
        tf.add_to_collection('dlosses', loss)

        return w_distance, loss
|
||||||
|
class Train (GNet):
    """
    This class trains a generator against a discriminator and saves the model at the last epoch
    """
    def __init__(self,**args):
        """
        :real   required matrix of real data
        :label  required matrix of labels
        The remaining arguments are those of GNet; the meta data is logged upon creation
        """
        GNet.__init__(self,**args)
        self.generator = Generator(**args)
        self.discriminator = Discriminator(**args)
        self._REAL = args['real']
        self._LABEL= args['label']
        # print ([" *** ",self.BATCHSIZE_PER_GPU])
        self.log_meta()

    def load_meta(self, column):
        """
        This function will delegate the calls to load meta data to its dependents
        column  name
        """
        super().load_meta(column)
        self.generator.load_meta(column)
        self.discriminator.load_meta(column)

    def loss(self,**args):
        """
        This function will compute a "tower" loss of the generated candidate against real data
        Training will consist in having both generator and discriminators
        :scope  name scope of the tower, used to filter the collected losses
        :stage  'D' for the discriminator loss, anything else for the generator loss
        :real   batch of real data
        :label  batch of labels, cast to int32 and collapsed to a single integer per row
        returns (total_loss, w) where w is the first value returned by the stage's loss
        """
        scope = args['scope']
        stage = args['stage']
        real = args['real']
        label = args['label']
        label = tf.cast(label, tf.int32)
        #
        # @TODO: Ziqi needs to explain what's going on here
        # column 1 is multiplied by the number of remaining columns and added to the sum of
        # columns 2.. weighted by their position (0,1,2 ...)
        m = [[i] for i in np.arange(self._LABEL.shape[1]-2)]
        label = label[:, 1] * len(m) + tf.squeeze(
            tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32))
        )
        # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] )
        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])

        fake = self.generator.network(inputs=z, label=label)
        if stage == 'D':
            w, loss = self.discriminator.loss(real=real, fake=fake, label=label)
            losses = tf.get_collection('dlosses', scope)
        else:
            w, loss = self.generator.loss(fake=fake, label=label)
            losses = tf.get_collection('glosses', scope)

        total_loss = tf.add_n(losses, name='total_loss')

        return total_loss, w

    def input_fn(self):
        """
        This function builds the input pipeline: placeholders shaped like the real data and the labels,
        sliced into a dataset that is repeated and batched by BATCHSIZE_PER_GPU rows
        returns (iterator, features_placeholder, labels_placeholder), the iterator must be initialized by feeding the placeholders
        """
        features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32)
        labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
        dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
        # repeated indefinitely: the batches are now consumed by the training steps and a
        # bounded count would end the training with an out of range error on long runs
        dataset = dataset.repeat()
        # drop_remainder keeps the batch dimension static, the losses are built with BATCHSIZE_PER_GPU rows
        dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU, drop_remainder=True)
        dataset = dataset.prefetch(1)
        iterator = dataset.make_initializable_iterator()
        # next_element = iterator.get_next()
        # init_op = iterator.initializer
        return iterator, features_placeholder, labels_placeholder

    def network(self,**args):
        """
        This function builds the training operation of a stage over every gpu (tower)
        :stage  'D' or 'G', also the variable scope whose trainable variables are updated
        :opt    optimizer used to compute and apply the gradients
        returns (train_op, mean_w, iterator, features_placeholder, labels_placeholder)
        """
        # def graph(stage, opt):
        # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False)
        stage = args['stage']
        opt = args['opt']
        tower_grads = []
        per_gpu_w = []
        iterator, features_placeholder, labels_placeholder = self.input_fn()
        with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
            for i in range(self.NUM_GPUS):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:
                        (real, label) = iterator.get_next()
                        # the batch served by the iterator is used; passing self._REAL/self._LABEL
                        # ignored the input pipeline and embedded the whole dataset in the graph
                        loss, w = self.loss(scope=scope, stage=stage, real=real, label=label)
                        tf.get_variable_scope().reuse_variables()
                        vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
                        grads = opt.compute_gradients(loss, vars_)
                        tower_grads.append(grads)
                        per_gpu_w.append(w)

        grads = self.average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads)

        mean_w = tf.reduce_mean(per_gpu_w)
        train_op = apply_gradient_op
        return train_op, mean_w, iterator, features_placeholder, labels_placeholder

    def apply(self,**args):
        """
        This function runs the training: for every epoch and every step the discriminator is trained
        twice and the generator once. The model is saved under <train_dir>/<synthetic column> when the
        epoch is a multiple of MAX_EPOCHS i.e at the last epoch
        """
        # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10
        REAL = self._REAL
        LABEL= self._LABEL
        with tf.device('/cpu:0'):
            opt_d = tf.compat.v1.train.AdamOptimizer(1e-4)
            opt_g = tf.compat.v1.train.AdamOptimizer(1e-4)

            train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d)
            train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g)
            # saver = tf.train.Saver()
            saver = tf.compat.v1.train.Saver()
            init = tf.global_variables_initializer()

        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(init)
            sess.run(iterator_d.initializer,
                    feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL})
            sess.run(iterator_g.initializer,
                    feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL})

            for epoch in range(1, self.MAX_EPOCHS + 1):
                start_time = time.time()
                w_sum = 0
                for i in range(self.STEPS_PER_EPOCH):
                    for _ in range(2):
                        _, w = sess.run([train_d, w_distance])
                        w_sum += w
                    sess.run(train_g)
                duration = time.time() - start_time

                assert not np.isnan(w_sum), 'Model diverged with loss = NaN'

                format_str = 'epoch: %d, w_distance = %f (%.1f)'
                print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration))
                if epoch % self.MAX_EPOCHS == 0:

                    _name = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']])
                    # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
                    saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
|
||||||
|
class Predict(GNet):
    """
    This class generates synthetic data given a learned model
    """
    def __init__(self,**args):
        """
        :values optional labels of the columns of the generated matrix, used by apply (by default None i.e positions are used)
        The remaining arguments are those of GNet
        """
        GNet.__init__(self,**args)
        self.generator = Generator(**args)
        # the original read an undefined name (values) and raised a NameError
        self.values = args['values'] if 'values' in args else None

    def load_meta(self, column):
        """
        Loads the meta data of a column for this object and for its generator
        """
        super().load_meta(column)
        self.generator.load_meta(column)

    def apply(self,**args):
        """
        This function restores the model saved at <train_dir>/<synthetic column>-<MAX_EPOCHS>, runs the
        generator with the labels given at creation and collapses the rounded output into a single column:
        every row receives the label of the last column holding a non-zero value, 0 if there is none
        returns a dictionary {<synthetic column>:[values]}
        """
        # print (self.train_dir)
        model_dir = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']+'-'+str(self.MAX_EPOCHS)])
        demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
        tf.compat.v1.reset_default_graph()
        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
        y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32)
        ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
        label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))

        fake = self.generator.network(inputs=z, label=label)
        saver = tf.compat.v1.train.Saver()
        with tf.compat.v1.Session() as sess:
            # the variables are restored from the checkpoint, no initialization is needed
            saver.restore(sess, model_dir)
            f = sess.run(fake,feed_dict={y:demo})
        #
        # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
        #
        df = ( pd.DataFrame(np.round(f).astype(np.int32),columns=self.values))
        # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms
        # df = (i * df).sum(axis=1)
        #
        # In case we are dealing with actual values like diagnosis codes we can perform
        #
        # NOTE(review): r is a numeric array, this presumably expects numeric column labels -- confirm
        r = np.zeros((self.ROW_COUNT,1))
        for col in df :
            i = np.where(df[col])[0]
            r[i] = col
        df = pd.DataFrame(r,columns=[self.ATTRIBUTES['synthetic']])

        return df.to_dict(orient='list')
|
||||||
|
# count = str(len(os.listdir(self.out_dir)))
|
||||||
|
# _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
|
||||||
|
# df.to_csv(_name,index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# output.extend(np.round(f))
|
||||||
|
|
||||||
|
# for m in range(2):
|
||||||
|
# for n in range(2, self.NUM_LABELS):
|
||||||
|
# idx1 = (demo[:, m] == 1)
|
||||||
|
# idx2 = (demo[:, n] == 1)
|
||||||
|
# idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
|
||||||
|
# num = np.sum(idx)
|
||||||
|
# print ("_____________________")
|
||||||
|
# print (idx1)
|
||||||
|
# print (idx2)
|
||||||
|
# print (idx)
|
||||||
|
# print (num)
|
||||||
|
# print ("_____________________")
|
||||||
|
# nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU))
|
||||||
|
# label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS))
|
||||||
|
# label_input[:, n] = 1
|
||||||
|
# label_input[:, m] = 1
|
||||||
|
# output = []
|
||||||
|
# for i in range(nbatch):
|
||||||
|
# f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]})
|
||||||
|
# output.extend(np.round(f))
|
||||||
|
# output = np.array(output)[:num]
|
||||||
|
# print ([m,n,output])
|
||||||
|
|
||||||
|
# np.save(self.out_dir + str(m) + str(n), output)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__' :
    #
    # Command line entry point. Expected arguments (parsed into SYS_ARGS):
    #   --raw-data <path>    csv file holding the data
    #   --column <name>      column to learn / synthesize
    #   --id <name>          column used to build the labels (default person_id)
    #   --train | --learn    train a model (optionally --max_epochs <n>)
    #   --generate           generate synthetic values from a trained model
    #
    column = SYS_ARGS['column']
    column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id'
    # The file is read once; both branches below work on the same frame.
    df = pd.read_csv(SYS_ARGS['raw-data'])
    # REAL / LABEL / values stay module-level names because other code in this
    # file refers to them as globals.
    LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values

    # The context is the file name without directory and extension
    # (previously a fixed 4 characters were cut off, which is only right
    # for 3-letter extensions such as .csv).
    context = os.path.splitext(os.path.basename(SYS_ARGS['raw-data']))[0]
    if set(['train','learn']) & set(SYS_ARGS.keys()):
        #
        # Train on the one-hot encoding of the requested column
        #
        max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10
        REAL = pd.get_dummies(df[column]).astype(np.float32).values
        trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id)
        trainer.apply()
    elif 'generate' in SYS_ARGS:
        values = df[column].unique().tolist()
        values.sort()
        p = Predict(context=context,label=LABEL,values=values)
        p.load_meta(column)
        r = p.apply()
        print (df)
        print ()
        df[column] = r[column]
        print (df)
    else:
        print (SYS_ARGS.keys())
        print (__doc__)
|
||||||
|
|
@ -0,0 +1,286 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.contrib.layers import l2_regularizer
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
|
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

#### id of gpu to use
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#### training data
#### shape=(n_sample, n_code=854)
REAL = np.load('')

#### demographics for the training data
#### shape=(n_sample, 6)
#### if sample_x is male, then LABEL[x,0]=1, else LABEL[x,1]=1
#### if sample_x's age is within 0-17, then LABEL[x,2]=1
#### elif sample_x's age is within 18-44, then LABEL[x,3]=1
#### elif sample_x's age is within 45-64, then LABEL[x,4]=1
#### elif sample_x's age is 64 or above, then LABEL[x,5]=1
LABEL = np.load('')

#### training parameters
NUM_GPUS = 1
BATCHSIZE_PER_GPU = 2000
TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS
#### number of full batches in the training set; the divisor is the batch size
#### constant instead of a repeated literal 2000 so the two cannot drift apart
#### NOTE(review): this loads 'ICD9/train.npy' rather than using REAL - confirm
#### that both refer to the same training matrix
STEPS_PER_EPOCH = np.load('ICD9/train.npy').shape[0] // TOTAL_BATCHSIZE

#### layer widths of generator / discriminator and size of the noise vector
g_structure = [128, 128]
d_structure = [854, 256, 128]
z_dim = 128
|
||||||
|
|
||||||
|
def _variable_on_cpu(name, shape, initializer=None):
    """Get or create variable `name` of the given shape under a '/cpu:0' device scope."""
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer)
|
||||||
|
|
||||||
|
|
||||||
|
def batchnorm(inputs, name, labels=None, n_labels=None):
    """
    Normalise `inputs` with moments taken over axis 0 and a per-label offset/scale.

    Two [n_labels, width] tables ('offset'+name, 'scale'+name) are created and the
    rows selected by `labels` are used as offset and scale of the normalisation.
    """
    batch_mean, batch_var = tf.nn.moments(inputs, [0], keep_dims=True)
    table_shape = [n_labels, batch_mean.shape[1].value]
    offset_table = _variable_on_cpu('offset' + name, table_shape,
                                    initializer=tf.zeros_initializer)
    scale_table = _variable_on_cpu('scale' + name, table_shape,
                                   initializer=tf.ones_initializer)
    return tf.nn.batch_normalization(
        inputs, batch_mean, batch_var,
        tf.nn.embedding_lookup(offset_table, labels),
        tf.nn.embedding_lookup(scale_table, labels),
        1e-8)
|
||||||
|
|
||||||
|
|
||||||
|
def layernorm(inputs, name, labels=None, n_labels=None):
    """
    Normalise `inputs` with moments taken over axis 1 and a per-label offset/scale.

    Two [n_labels, width] tables ('offset'+name, 'scale'+name) are created and the
    rows selected by `labels` are used as offset and scale of the normalisation.
    """
    row_mean, row_var = tf.nn.moments(inputs, [1], keep_dims=True)
    table_shape = [n_labels, inputs.shape[1].value]
    offset_table = _variable_on_cpu('offset' + name, table_shape,
                                    initializer=tf.zeros_initializer)
    scale_table = _variable_on_cpu('scale' + name, table_shape,
                                   initializer=tf.ones_initializer)
    return tf.nn.batch_normalization(
        inputs, row_mean, row_var,
        tf.nn.embedding_lookup(offset_table, labels),
        tf.nn.embedding_lookup(scale_table, labels),
        1e-8)
|
||||||
|
|
||||||
|
|
||||||
|
def input_fn():
    """
    Build the input pipeline fed through placeholders.

    :return: (initializable iterator, features placeholder, labels placeholder);
             the placeholders have the shapes of REAL and LABEL.
    """
    feats = tf.placeholder(shape=REAL.shape, dtype=tf.float32)
    labs = tf.placeholder(shape=LABEL.shape, dtype=tf.float32)
    pipeline = (
        tf.data.Dataset.from_tensor_slices((feats, labs))
        .repeat(10000)
        .batch(batch_size=BATCHSIZE_PER_GPU)
        .prefetch(1)
    )
    return pipeline.make_initializable_iterator(), feats, labs
|
||||||
|
|
||||||
|
|
||||||
|
def generator(z, label):
    """
    Generator network: residual blocks with label-conditioned batch norm,
    followed by a sigmoid output layer.

    :param z:     noise tensor whose second dimension is z_dim
    :param label: integer label id per row, used to pick the batch-norm
                  offset/scale (8 label ids are provisioned)
    :return:      tensor with d_structure[0] columns, values in (0, 1)
    """
    x = z
    tmp_dim = z_dim
    with tf.variable_scope('G', reuse=tf.AUTO_REUSE, regularizer=l2_regularizer(0.00001)):
        for i, dim in enumerate(g_structure[:-1]):
            kernel = _variable_on_cpu('W_' + str(i), shape=[tmp_dim, dim])
            h1 = batchnorm(tf.matmul(x, kernel), name='cbn' + str(i), labels=label, n_labels=8)
            h2 = tf.nn.relu(h1)
            x = x + h2
            tmp_dim = dim
        i = len(g_structure) - 1
        kernel = _variable_on_cpu('W_' + str(i), shape=[tmp_dim, g_structure[-1]])
        h1 = batchnorm(tf.matmul(x, kernel), name='cbn' + str(i),
                       labels=label, n_labels=8)
        h2 = tf.nn.tanh(h1)
        x = x + h2

        # Output layer. Because of the residual additions x keeps the width of z,
        # and the output must match the discriminator input width, so the shape
        # is derived from z_dim / d_structure[0] instead of the literals 128 / 854.
        kernel = _variable_on_cpu('W_' + str(i+1), shape=[z_dim, d_structure[0]])
        bias = _variable_on_cpu('b_' + str(i+1), shape=[d_structure[0]])
        x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias))
    return x
|
||||||
|
|
||||||
|
|
||||||
|
def discriminator(x, label):
    """
    Critic network: dense+relu layers with label-conditioned layer norm,
    followed by a linear layer with a single output column.
    """
    with tf.variable_scope('D', reuse=tf.AUTO_REUSE, regularizer=l2_regularizer(0.00001)):
        for idx, width in enumerate(d_structure[1:]):
            tag = str(idx)
            weights = _variable_on_cpu('W_' + tag, shape=[d_structure[idx], width])
            offsets = _variable_on_cpu('b_' + tag, shape=[width])
            activated = tf.nn.relu(tf.add(tf.matmul(x, weights), offsets))
            x = layernorm(activated, name='cln' + tag, labels=label, n_labels=8)
        # The output layer is named after len(d_structure), as before.
        tag = str(len(d_structure))
        weights = _variable_on_cpu('W_' + tag, shape=[d_structure[-1], 1])
        offsets = _variable_on_cpu('b_' + tag, shape=[1])
        return tf.add(tf.matmul(x, weights), offsets)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_dloss(real, fake, label):
    """
    Critic loss: Wasserstein estimate + 10 * gradient penalty + regularisation.

    The loss is also appended to the 'dlosses' collection.
    :return: (w_distance, loss)
    """
    mix = tf.random_uniform(shape=[BATCHSIZE_PER_GPU, 1], minval=0., maxval=1.)
    interpolated = real + mix * (fake - real)
    score_fake = discriminator(fake, label)
    score_real = discriminator(real, label)
    score_mid = discriminator(interpolated, label)

    # Penalise the gradient norm at the interpolated points for deviating from 1.
    grad_mid = tf.gradients(score_mid, [interpolated])[0]
    norms = tf.sqrt(tf.reduce_sum(tf.square(grad_mid), 1))
    penalty = tf.reduce_mean((norms - 1.) ** 2)

    reg_terms = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    w_distance = -tf.reduce_mean(score_real) + tf.reduce_mean(score_fake)
    loss = w_distance + 10 * penalty + sum(reg_terms)
    tf.add_to_collection('dlosses', loss)
    return w_distance, loss
|
||||||
|
|
||||||
|
|
||||||
|
def compute_gloss(fake, label):
    """
    Generator loss: negated mean critic score of the fakes + regularisation.

    The loss is also appended to the 'glosses' collection.
    :return: (loss, loss) - the same tensor twice, mirroring compute_dloss.
    """
    reg_terms = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) \
        if False else None  # placeholder, replaced below to keep op order
    score_fake = discriminator(fake, label)
    reg_terms = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = -tf.reduce_mean(score_fake) + sum(reg_terms)
    tf.add_to_collection('glosses', loss)
    return loss, loss
|
||||||
|
|
||||||
|
|
||||||
|
def tower_loss(scope, stage, real, label):
    """
    Build the loss of one tower for the given stage.

    :param scope: name scope of the tower, used to filter the loss collection
    :param stage: 'D' for the critic loss, anything else for the generator loss
    :return:      (total_loss, w) where w is the first value returned by the
                  stage's loss function
    """
    onehot = tf.cast(label, tf.int32)
    # Collapse the one-hot row into one integer id: column 1 times 4 plus the
    # position (0..3) of the active entry among columns 2 onwards.
    label = onehot[:, 1] * 4 + tf.squeeze(
        tf.matmul(onehot[:, 2:], tf.constant([[0], [1], [2], [3]], dtype=tf.int32)))
    noise = tf.random_normal(shape=[BATCHSIZE_PER_GPU, z_dim])
    fake = generator(noise, label)
    if stage == 'D':
        w, _ = compute_dloss(real, fake, label)
        collection = 'dlosses'
    else:
        w, _ = compute_gloss(fake, label)
        collection = 'glosses'

    total_loss = tf.add_n(tf.get_collection(collection, scope), name='total_loss')
    return total_loss, w
|
||||||
|
|
||||||
|
|
||||||
|
def average_gradients(tower_grads):
    """
    Average gradients across towers.

    :param tower_grads: one list of (gradient, variable) pairs per tower, all in
                        the same variable order
    :return: list of (mean gradient, variable) pairs; the variable is taken
             from the first tower
    """
    averaged = []
    for per_variable in zip(*tower_grads):
        stacked = tf.concat(
            axis=0,
            values=[tf.expand_dims(g, 0) for g, _ in per_variable])
        averaged.append((tf.reduce_mean(stacked, 0), per_variable[0][1]))
    return averaged
|
||||||
|
|
||||||
|
|
||||||
|
def graph(stage, opt):
    """
    Build the training graph of one stage ('D' or 'G') over NUM_GPUS towers.

    :return: (train_op, mean w over towers, iterator, features placeholder,
              labels placeholder)
    """
    iterator, features_placeholder, labels_placeholder = input_fn()
    grads_per_tower, w_per_tower = [], []
    with tf.variable_scope(tf.get_variable_scope()):
        for gpu in range(NUM_GPUS):
            with tf.device('/gpu:%d' % gpu), tf.name_scope('%s_%d' % ('TOWER', gpu)) as scope:
                real, label = iterator.get_next()
                loss, w = tower_loss(scope, stage, real, label)
                # Later towers share the variables created by the first one.
                tf.get_variable_scope().reuse_variables()
                # Only the variables of the current stage's scope are trained.
                stage_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
                grads_per_tower.append(opt.compute_gradients(loss, stage_vars))
                w_per_tower.append(w)

    train_op = opt.apply_gradients(average_gradients(grads_per_tower))
    mean_w = tf.reduce_mean(w_per_tower)
    return train_op, mean_w, iterator, features_placeholder, labels_placeholder
|
||||||
|
|
||||||
|
|
||||||
|
def train(max_epochs, train_dir):
    """
    Train critic and generator and write checkpoints.

    Per step the critic is updated twice and the generator once. A checkpoint
    is written every 500 epochs and, in addition, after the last epoch so that
    runs whose length is not a multiple of 500 still leave a model behind.

    :param max_epochs: number of epochs to run
    :param train_dir:  path prefix handed to the saver; the epoch number is
                       appended as global step
    :raises AssertionError: when the accumulated critic value becomes NaN
    """
    with tf.device('/cpu:0'):
        opt_d = tf.train.AdamOptimizer(1e-4)
        opt_g = tf.train.AdamOptimizer(1e-4)
        train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = graph('D', opt_d)
        train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = graph('G', opt_g)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
        sess.run(init)
        # Each stage has its own iterator; both are fed the same data.
        sess.run(iterator_d.initializer,
                 feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL})
        sess.run(iterator_g.initializer,
                 feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL})

        for epoch in range(1, max_epochs + 1):
            start_time = time.time()
            w_sum = 0
            for i in range(STEPS_PER_EPOCH):
                for _ in range(2):
                    _, w = sess.run([train_d, w_distance])
                    w_sum += w
                sess.run(train_g)
            duration = time.time() - start_time

            assert not np.isnan(w_sum), 'Model diverged with loss = NaN'

            format_str = 'epoch: %d, w_distance = %f (%.1f)'
            print(format_str % (epoch, -w_sum/(STEPS_PER_EPOCH*2), duration))
            if epoch % 500 == 0 or epoch == max_epochs:
                saver.save(sess, train_dir, write_meta_graph=False, global_step=epoch)
|
||||||
|
|
||||||
|
|
||||||
|
def generate(model_dir, synthetic_dir, demo):
    """
    Restore a trained generator and write synthetic data per label combination.

    For every pair (m in 0..1, n in 2..5) the rows of `demo` having both
    columns set to 1 are counted and that many rounded generator outputs are
    saved with np.save under synthetic_dir + str(m) + str(n).
    """
    tf.reset_default_graph()
    z = tf.random_normal(shape=[BATCHSIZE_PER_GPU, z_dim])
    y = tf.placeholder(shape=[BATCHSIZE_PER_GPU, 6], dtype=tf.int32)
    label = y[:, 1] * 4 + tf.squeeze(tf.matmul(y[:, 2:], tf.constant([[0], [1], [2], [3]], dtype=tf.int32)))
    fake = generator(z, label)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, model_dir)
        for m in range(2):
            for n in range(2, 6):
                in_m = (demo[:, m] == 1)
                in_n = (demo[:, n] == 1)
                both = [a and b for a, b in zip(in_m, in_n)]
                num = np.sum(both)
                # Generate whole batches, then cut back to the needed count.
                nbatch = int(np.ceil(num / BATCHSIZE_PER_GPU))
                label_input = np.zeros((nbatch*BATCHSIZE_PER_GPU, 6))
                label_input[:, n] = 1
                label_input[:, m] = 1
                rows = []
                for b in range(nbatch):
                    chunk = label_input[b*BATCHSIZE_PER_GPU:(b+1)*BATCHSIZE_PER_GPU]
                    rows.extend(np.round(sess.run(fake, feed_dict={y: chunk})))
                np.save(synthetic_dir + str(m) + str(n), np.array(rows)[:num])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Train the model.
    #### args_1: number of training epochs
    #### args_2: path prefix under which the trained model is saved
    train(500, '')

    # Generate synthetic data from the trained model.
    #### args_1: path of the trained model checkpoint to restore
    #### args_2: path prefix under which the synthetic data is saved
    #### args_3: labels of the data to be generated
    # NOTE(review): both paths are empty placeholders and must be filled in
    # before running.
    generate('', '', demo=LABEL)
|
||||||
|
|
@ -0,0 +1,18 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
def parse_args(argv):
    """
    Turn a command line into a dictionary.

    Every token starting with '--' becomes a key (without the dashes). If the
    next token is not itself a '--' flag it becomes the key's value, otherwise
    the key is stored with the value 1. Tokens not starting with '--' that do
    not follow a flag are ignored.

    :param argv: argument list in sys.argv layout (argv[0] is skipped); values
                 are stripped in place, as the previous implementation did
    :return:     dict that always holds the key 'context' (default '')
    """
    parsed = {'context': ''}
    count = len(argv)
    for i in range(1, count):
        token = argv[i]
        if not token.startswith('--'):
            continue
        key = token[2:]
        parsed[key] = 1
        if i + 1 < count:
            candidate = argv[i + 1] = argv[i + 1].strip()
            # A following flag is not a value: "--train --max_epochs 10" must
            # not store '--max_epochs' as the value of 'train'.
            if key and candidate and not candidate.startswith('--'):
                parsed[key] = candidate
    return parsed


SYS_ARGS = parse_args(sys.argv)
|
Binary file not shown.
@ -0,0 +1,287 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.contrib.layers import l2_regularizer
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||||
|
# os.environ['CUDA_VISIBLE_DEVICES'] = "4,5"
|
||||||
|
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
|
||||||
|
|
||||||
|
|
||||||
|
# Command line flags of this script; the values are read through FLAGS.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('train_dir', 'google_cloud_test/',
                           """Directory where to store checkpoint. """)
tf.app.flags.DEFINE_string('save_dir', 'google_cloud_test/',
                           """Directory where to save generated data. """)
tf.app.flags.DEFINE_integer('max_steps', 100,
                            """Number of batches to run in each epoch.""")
tf.app.flags.DEFINE_integer('max_epochs', 100,
                            """Number of epochs to run.""")
tf.app.flags.DEFINE_integer('batchsize', 10,
                            """Batchsize.""")
tf.app.flags.DEFINE_integer('z_dim', 10,
                            """Dimensionality of random input.""")
tf.app.flags.DEFINE_integer('data_dim', 30,
                            """Dimensionality of data.""")
tf.app.flags.DEFINE_integer('demo_dim', 8,
                            """Dimensionality of demographics.""")
tf.app.flags.DEFINE_float('reg', 0.0001,
                          """L2 regularization.""")

# Layer widths of the generator and of the discriminator. Both are computed
# from the flag values at import time.
g_structure = [FLAGS.z_dim, FLAGS.z_dim]
d_structure = [FLAGS.data_dim, int(FLAGS.data_dim/2), FLAGS.z_dim]
|
||||||
|
|
||||||
|
|
||||||
|
def _variable_on_cpu(name, shape, initializer=None):
    """Get or create variable `name` of the given shape under a '/cpu:0' device scope."""
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer)
|
||||||
|
|
||||||
|
|
||||||
|
def batchnorm(inputs, name, labels=None, n_labels=None):
    """
    Normalise `inputs` with moments taken over axis 0 and a per-label offset/scale.

    Two [n_labels, width] tables ('offset'+name, 'scale'+name) are created and the
    rows selected by `labels` are used as offset and scale of the normalisation.
    """
    batch_mean, batch_var = tf.nn.moments(inputs, [0], keep_dims=True)
    table_shape = [n_labels, batch_mean.shape[1].value]
    offset_table = _variable_on_cpu('offset' + name, table_shape,
                                    initializer=tf.zeros_initializer)
    scale_table = _variable_on_cpu('scale' + name, table_shape,
                                   initializer=tf.ones_initializer)
    return tf.nn.batch_normalization(
        inputs, batch_mean, batch_var,
        tf.nn.embedding_lookup(offset_table, labels),
        tf.nn.embedding_lookup(scale_table, labels),
        1e-8)
|
||||||
|
|
||||||
|
|
||||||
|
def layernorm(inputs, name, labels=None, n_labels=None):
    """
    Normalise `inputs` with moments taken over axis 1 and a per-label offset/scale.

    Two [n_labels, width] tables ('offset'+name, 'scale'+name) are created and the
    rows selected by `labels` are used as offset and scale of the normalisation.
    """
    row_mean, row_var = tf.nn.moments(inputs, [1], keep_dims=True)
    table_shape = [n_labels, inputs.shape[1].value]
    offset_table = _variable_on_cpu('offset' + name, table_shape,
                                    initializer=tf.zeros_initializer)
    scale_table = _variable_on_cpu('scale' + name, table_shape,
                                   initializer=tf.ones_initializer)
    return tf.nn.batch_normalization(
        inputs, row_mean, row_var,
        tf.nn.embedding_lookup(offset_table, labels),
        tf.nn.embedding_lookup(scale_table, labels),
        1e-8)
|
||||||
|
|
||||||
|
|
||||||
|
def input_fn():
    """
    Build the input pipeline fed through placeholders.

    :return: (initializable iterator, features placeholder of width
              FLAGS.data_dim, labels placeholder of width 6)
    """
    feats = tf.placeholder(shape=[None, FLAGS.data_dim], dtype=tf.float32)
    labs = tf.placeholder(shape=[None, 6], dtype=tf.float32)
    pipeline = (
        tf.data.Dataset.from_tensor_slices((feats, labs))
        .repeat(10000)
        .batch(batch_size=FLAGS.batchsize)
        .prefetch(1)
    )
    return pipeline.make_initializable_iterator(), feats, labs
|
||||||
|
|
||||||
|
|
||||||
|
def generator(z, label):
    """
    Generator network: residual blocks with label-conditioned batch norm,
    followed by a sigmoid output layer of width FLAGS.data_dim.
    """
    x = z
    in_dim = FLAGS.z_dim
    with tf.variable_scope('G', reuse=tf.AUTO_REUSE, regularizer=l2_regularizer(FLAGS.reg)):
        # Hidden residual blocks with relu activation.
        for idx, out_dim in enumerate(g_structure[:-1]):
            weights = _variable_on_cpu('W_' + str(idx), shape=[in_dim, out_dim])
            normed = batchnorm(tf.matmul(x, weights), name='cbn' + str(idx),
                               labels=label, n_labels=FLAGS.demo_dim)
            x = x + tf.nn.relu(normed)
            in_dim = out_dim

        # Last residual block uses tanh.
        last = len(g_structure) - 1
        weights = _variable_on_cpu('W_' + str(last), shape=[in_dim, g_structure[-1]])
        normed = batchnorm(tf.matmul(x, weights), name='cbn' + str(last),
                           labels=label, n_labels=FLAGS.demo_dim)
        x = x + tf.nn.tanh(normed)

        # Output layer.
        out_w = _variable_on_cpu('W_' + str(last + 1), shape=[FLAGS.z_dim, FLAGS.data_dim])
        out_b = _variable_on_cpu('b_' + str(last + 1), shape=[FLAGS.data_dim])
        return tf.nn.sigmoid(tf.add(tf.matmul(x, out_w), out_b))
|
||||||
|
|
||||||
|
|
||||||
|
def discriminator(x, label):
    """
    Critic network: dense+relu layers with label-conditioned layer norm,
    followed by a linear layer with a single output column.
    """
    with tf.variable_scope('D', reuse=tf.AUTO_REUSE, regularizer=l2_regularizer(FLAGS.reg)):
        for idx, width in enumerate(d_structure[1:]):
            tag = str(idx)
            weights = _variable_on_cpu('W_' + tag, shape=[d_structure[idx], width])
            offsets = _variable_on_cpu('b_' + tag, shape=[width])
            activated = tf.nn.relu(tf.add(tf.matmul(x, weights), offsets))
            x = layernorm(activated, name='cln' + tag, labels=label, n_labels=FLAGS.demo_dim)
        # The output layer is named after len(d_structure), as before.
        tag = str(len(d_structure))
        weights = _variable_on_cpu('W_' + tag, shape=[d_structure[-1], 1])
        offsets = _variable_on_cpu('b_' + tag, shape=[1])
        return tf.add(tf.matmul(x, weights), offsets)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_dloss(real, fake, label):
    """
    Critic loss: Wasserstein estimate + 10 * gradient penalty + regularisation.

    The regularisation terms are added to the loss only. They used to be folded
    into the returned w_distance as well, which distorted the value that the
    training loop logs as Wasserstein distance; the optimised loss is the same
    sum of terms as before.

    The loss is also appended to the 'dlosses' collection.
    :return: (w_distance, loss)
    """
    epsilon = tf.random_uniform(
        shape=[FLAGS.batchsize, 1],
        minval=0.,
        maxval=1.)
    x_hat = real + epsilon * (fake - real)
    y_hat_fake = discriminator(fake, label)
    y_hat_real = discriminator(real, label)
    y_hat = discriminator(x_hat, label)

    # Penalise the gradient norm at the interpolated points for deviating from 1.
    grad = tf.gradients(y_hat, [x_hat])[0]
    slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1))
    gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2)
    all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake)
    loss = w_distance + 10 * gradient_penalty + sum(all_regs)
    tf.add_to_collection('dlosses', loss)

    return w_distance, loss
|
||||||
|
|
||||||
|
|
||||||
|
def compute_gloss(fake, label):
    """
    Generator loss: negated mean critic score of the fakes + regularisation.

    The loss is also appended to the 'glosses' collection.
    :return: (loss, loss) - the same tensor twice, mirroring compute_dloss.
    """
    score_fake = discriminator(fake, label)
    reg_terms = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    generator_loss = -tf.reduce_mean(score_fake)+sum(reg_terms)
    tf.add_to_collection('glosses', generator_loss)
    return generator_loss, generator_loss
|
||||||
|
|
||||||
|
|
||||||
|
def tower_loss(scope, stage, real, label):
    """
    Build the loss of one tower for the given stage.

    :param scope: name scope of the tower, used to filter the loss collection
    :param stage: 'D' for the critic loss, anything else for the generator loss
    :return:      (total_loss, w) where w is the first value returned by the
                  stage's loss function
    """
    onehot = tf.cast(label, tf.int32)
    print ([stage,onehot.shape])
    # Collapse the one-hot row into one integer id: column 1 times 4 plus the
    # position (0..3) of the active entry among columns 2 onwards.
    label = onehot[:, 1] * 4 + tf.squeeze(
        tf.matmul(onehot[:, 2:], tf.constant([[0], [1], [2], [3]], dtype=tf.int32)))
    noise = tf.random_normal(shape=[FLAGS.batchsize, FLAGS.z_dim])
    fake = generator(noise, label)
    if stage == 'D':
        w, _ = compute_dloss(real, fake, label)
        collection = 'dlosses'
    else:
        w, _ = compute_gloss(fake, label)
        collection = 'glosses'

    total_loss = tf.add_n(tf.get_collection(collection, scope), name='total_loss')
    return total_loss, w
|
||||||
|
|
||||||
|
|
||||||
|
def average_gradients(tower_grads):
    """
    Average gradients across towers.

    :param tower_grads: one list of (gradient, variable) pairs per tower, all in
                        the same variable order
    :return: list of (mean gradient, variable) pairs; the variable is taken
             from the first tower
    """
    averaged = []
    for per_variable in zip(*tower_grads):
        stacked = tf.concat(
            axis=0,
            values=[tf.expand_dims(g, 0) for g, _ in per_variable])
        averaged.append((tf.reduce_mean(stacked, 0), per_variable[0][1]))
    return averaged
|
||||||
|
|
||||||
|
|
||||||
|
def graph(stage, opt):
    """
    Build the training graph of one stage ('D' or 'G') with a single CPU tower.

    :return: (train_op, mean w over towers, iterator, features placeholder,
              labels placeholder)
    """
    iterator, features_placeholder, labels_placeholder = input_fn()
    grads_per_tower, w_per_tower = [], []
    with tf.variable_scope(tf.get_variable_scope()):
        for tower in range(1):
            with tf.device('/cpu:0'), tf.name_scope('%s_%d' % ('TOWER', tower)) as scope:
                real, label = iterator.get_next()

                loss, w = tower_loss(scope, stage, real, label)
                tf.get_variable_scope().reuse_variables()
                # Only the variables of the current stage's scope are trained.
                stage_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
                grads_per_tower.append(opt.compute_gradients(loss, stage_vars))
                w_per_tower.append(w)

    train_op = opt.apply_gradients(average_gradients(grads_per_tower))
    mean_w = tf.reduce_mean(w_per_tower)
    return train_op, mean_w, iterator, features_placeholder, labels_placeholder
|
||||||
|
|
||||||
|
|
||||||
|
def train(data, demo):
    """
    Train critic and generator on (data, demo) and save the final model.

    Per step the critic is updated twice and the generator once. The model is
    saved under FLAGS.train_dir + 'emr_wgan' when the epoch number is a
    multiple of FLAGS.max_epochs.
    """
    with tf.device('/cpu:0'):
        opt_d = tf.train.AdamOptimizer(1e-4)
        opt_g = tf.train.AdamOptimizer(1e-4)
        train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = graph('D', opt_d)
        train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = graph('G', opt_g)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()

    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    with tf.Session(config=session_config) as sess:
        sess.run(init)
        # Each stage has its own iterator; both are fed the same data.
        feed_d = {features_placeholder_d: data, labels_placeholder_d: demo}
        sess.run(iterator_d.initializer, feed_dict=feed_d)
        feed_g = {features_placeholder_g: data, labels_placeholder_g: demo}
        sess.run(iterator_g.initializer, feed_dict=feed_g)

        for epoch in range(1, FLAGS.max_epochs + 1):
            started = time.time()
            w_sum = 0
            for step in range(FLAGS.max_steps):
                for _ in range(2):
                    _, w = sess.run([train_d, w_distance])
                    w_sum += w
                sess.run(train_g)
            duration = time.time() - started

            assert not np.isnan(w_sum), 'Model diverged with loss = NaN'

            print('epoch: %d, w_distance = %f (%.1f)' % (epoch, -w_sum/(FLAGS.max_steps*2), duration))
            if epoch % FLAGS.max_epochs == 0:
                saver.save(sess, FLAGS.train_dir + 'emr_wgan', write_meta_graph=False, global_step=epoch)
|
||||||
|
|
||||||
|
|
||||||
|
def generate(demo):
    """
    Restore the trained generator and write synthetic data per label combination.

    For every pair (m in 0..1, n in 2..5) the rows of `demo` having both
    columns set to 1 are counted and that many rounded generator outputs are
    saved with np.save under FLAGS.save_dir + 'synthetic_' + str(m) + str(n).
    """
    z = tf.random_normal(shape=[FLAGS.batchsize, FLAGS.z_dim])
    y = tf.placeholder(shape=[FLAGS.batchsize, 6], dtype=tf.int32)
    label = y[:, 1] * 4 + tf.squeeze(tf.matmul(y[:, 2:], tf.constant([[0], [1], [2], [3]], dtype=tf.int32)))
    fake = generator(z, label)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, FLAGS.train_dir + 'emr_wgan-' + str(FLAGS.max_epochs))
        for m in range(2):
            for n in range(2, 6):
                in_m = (demo[:, m] == 1)
                in_n = (demo[:, n] == 1)
                both = [a and b for a, b in zip(in_m, in_n)]
                num = np.sum(both)
                # Generate whole batches, then cut back to the needed count.
                nbatch = int(np.ceil(num / FLAGS.batchsize))
                label_input = np.zeros((nbatch*FLAGS.batchsize, 6))
                label_input[:, n] = 1
                label_input[:, m] = 1
                rows = []
                for b in range(nbatch):
                    chunk = label_input[b*FLAGS.batchsize:(b+1)*FLAGS.batchsize]
                    rows.extend(np.round(sess.run(fake, feed_dict={y: chunk})))
                np.save(FLAGS.save_dir + 'synthetic_' + str(m) + str(n), np.array(rows)[:num])
|
||||||
|
|
||||||
|
|
||||||
|
def load_data():
    """
    Create a random toy dataset.

    :return: (data, demo) where data is a 100x30 matrix of 0/1 values with ones
             at 900 randomly drawn (possibly repeated) flat positions, and demo
             is a 100x6 matrix with one 1 in columns 0-1 and one 1 in columns
             2-5 of every row.
    """
    flat = np.zeros(3000)
    flat[np.random.choice(np.arange(3000), size=900)] = 1
    data = flat.reshape((100, 30))

    upper_cols = np.random.randint(2, 6, size=100)
    lower_cols = np.random.randint(2, size=100)
    rows = np.arange(100)
    demo = np.zeros((100, 6))
    demo[rows, upper_cols] = 1
    demo[rows, lower_cols] = 1
    return data, demo
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Build the random toy dataset, print its shapes and train on it.
    data, demo = load_data()
    print ([data.shape,demo.shape])
    train(data, demo)
    # Generation from the trained model is currently disabled.
    # generate(demo)
|
||||||
|
|
@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"type": "service_account",
|
||||||
|
"project_id": "aou-res-deid-vumc-test",
|
||||||
|
"private_key_id": "8b7acef9a1f1137799011cf13cf0906e331c472e",
|
||||||
|
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCYRPv0ZMGLXjva\nVZjJlcApDpXhJl2iDghhG0JqUH1PmuLjMtmhuMSgweq+M3KNF92Wft9Ree+fTN6m\nVtyqZMgz1qXi6I1WJHyT+ndtk4eWlE4O1AxE0QkfLqtj1kafU6Yu2tGpZ23jHFG9\nc7oq1tqPwC39pKE3ScShcpbZxFqvOFwW7ZSHEQ2Zk0/9lA0bfQH+Vaq1JqBbMkCO\nh1p1ptXPHyIoTjgbtQ/3N6JHA9XpqF1DHFQTe6H/4Zc+GUBV8kb/9pdeybcrhd1K\nVzuT6pAkOLQ7Wtq9Hwl3zAF3jyhlEpirYt4tjcw1pq0phhUuDGcLS37cTzWkqekr\nFEp8NkSnAgMBAAECggEAI16Kw+cPigb2ki2l0tVlEGRh7i2SPE1UJvJFCBrwMKiC\noVGzebxIeCrzEwEyT5HGl+mah/tx7KfXY/3zPeUxF9F5MO7hvau2AE2CpkJJkXGb\nfBhHTUjc/JBDoWopd2LfzCxp3Ra4ULPITOBv0vmbRR7Xz/4IsKYC9Zl/btAMXHy4\nJZZuifK8mCD4BDXxG6W2p+jqeKFjKYTuHyCKWy9u8NnnH6eoNMLvewr/P3pPZK9l\nSFQDV0nWU0yZoR4cccYHtq/9Uw1pY7A9iNYI4JnAnPam8Rka0OEgZbqMVsk3FUmA\nG+SOtiJ9iopQsW5g/HTG7Q420gijnfe5IWQK6yLBOQKBgQDNCuGexHMUGB+/bxFK\nnQ+AiktFib76PbMYFSGdsQQYHGcNHXmXRnJbpj/llO7tiWk/akOA0UrjtipXERTP\nYoXRDlghvnluxUYDm+mD94jSe7rE45b+sNH8FyqgrHWJVHSPBcIz0YXCUxRmE9eq\n4BcNfTqtjAl7hasWhGUVlXppawKBgQC+HJn1Lpvp89h+7ge09p6SU6RhAbOygrtA\nBD3Odr6WV6SGXEKyFHSHLkRVA1BFzzTXl3nEJvHFe7I5RNnVzWSqmf4LkBcIDqQO\nmiNb2TbA/h4utlMJvTrit03qdzngvgmoWyKqNpxmj6afNU/up4ck0hqBkJae/FBQ\nkoSwXcA0tQKBgDJzE/JZiasPCHi0nj+Kh27sF/sjGj8+ARvSzzOag1RfYKekce9b\noPWV4TDexS7i2WeGANfoJxICF0bW6BTiu+QlMGAVGpG7ri9jJECZHiwTz290RAmk\nffYVySJBbKX+hrNOCmtviQa4JFO9XBoqCuIBxvc+dnLS/7aJmsmFvtnDAoGAfQRf\n9gzdeN7i+q1bIhSfuIgKa8RrwDMaIgHoBxKtSD6AMd8P+P1cl9zEEMeqDQ4yqKey\n6lvV19D9JY3yVhfIYCv+FOp/Sswd9IBGSkswJ3+0p3E8cAYhaB+0vEAFLpap0S2F\nQTvCY+uJXd74Hm/KflswFQ3ZDtnLkwCXA0fTcpUCgYBMkcE6Bn0tIShaXsaaufIW\nXrJ6gtEUDtUXP85lNO7hUxBWTu2dF6OsgBniNfWypmRecaZsFl/sD6YKT0bV1vvv\nU0uhYTDx5z7o8ahvjBwOqF5sDDVX02umFBoG16zd3hpOJrGSh+ESpJhWw5dV6m5J\n530zPFObyt2kI9+E75+G/w==\n-----END PRIVATE KEY-----\n",
|
||||||
|
"client_email": "dev-deid-600@aou-res-deid-vumc-test.iam.gserviceaccount.com",
|
||||||
|
"client_id": "104228831510203920964",
|
||||||
|
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
||||||
|
"token_uri": "https://oauth2.googleapis.com/token",
|
||||||
|
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
||||||
|
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/dev-deid-600%40aou-res-deid-vumc-test.iam.gserviceaccount.com"
|
||||||
|
}
|
Loading…
Reference in new issue