initial commit
This commit is contained in:
commit
f04dc7e19d
9 changed files with 7340 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
deploy
|
||||
README.md
|
14
app/api-pipelines.yaml
Normal file
14
app/api-pipelines.yaml
Normal file
|
@ -0,0 +1,14 @@
|
|||
apiVersion: "unified-platform.cs.hse.ru/v1"
|
||||
kind: APIComponent
|
||||
metadata:
|
||||
name: api-pipelines
|
||||
namespace: pu-mgalynchik-pa-mm25-synthdata
|
||||
spec:
|
||||
published: true
|
||||
pipelines:
|
||||
enabled: true
|
||||
restfulApi:
|
||||
auth:
|
||||
basic:
|
||||
credentials: synth-apis-cred
|
||||
identityPassThrough: true
|
16
app/box.yaml
Normal file
16
app/box.yaml
Normal file
|
@ -0,0 +1,16 @@
|
|||
apiVersion: "unified-platform.cs.hse.ru/v1"
|
||||
kind: DataBox
|
||||
metadata:
|
||||
name: synthdata-box
|
||||
namespace: pu-mgalynchik-pa-mm25-synthdata
|
||||
spec:
|
||||
s3DefaultStorage: {}
|
||||
---
|
||||
|
||||
apiVersion: "unified-platform.cs.hse.ru/v1"
|
||||
kind: DataBox
|
||||
metadata:
|
||||
name: users
|
||||
namespace: pu-mgalynchik-pa-mm25-synthdata
|
||||
spec:
|
||||
s3DefaultStorage: {}
|
14
app/files-api.yaml
Normal file
14
app/files-api.yaml
Normal file
|
@ -0,0 +1,14 @@
|
|||
apiVersion: "unified-platform.cs.hse.ru/v1"
|
||||
kind: APIComponent
|
||||
metadata:
|
||||
name: files-api
|
||||
namespace: pu-mgalynchik-pa-mm25-synthdata
|
||||
spec:
|
||||
published: true
|
||||
files:
|
||||
enabled: true
|
||||
restfulApi:
|
||||
auth:
|
||||
basic:
|
||||
credentials: synth-apis-cred
|
||||
identityPassThrough: true
|
29
app/syntdata-test-api.yaml
Normal file
29
app/syntdata-test-api.yaml
Normal file
|
@ -0,0 +1,29 @@
|
|||
|
||||
apiVersion: "unified-platform.cs.hse.ru/v1"
|
||||
kind: APIComponent
|
||||
metadata:
|
||||
name: train-generate-api
|
||||
namespace: pu-mgalynchik-pa-mm25-synthdata
|
||||
spec:
|
||||
published: true
|
||||
experimentPipeline:
|
||||
name: train-generate
|
||||
restfulApi:
|
||||
auth:
|
||||
basic:
|
||||
credentials: synth-apis-cred
|
||||
identityPassThrough: true
|
||||
apiSpec:
|
||||
inputs:
|
||||
- name: input_data
|
||||
description: "Путь до папки с датасетом для обучения"
|
||||
type:
|
||||
datatypes: [ "FILE" ]
|
||||
contentTypes: [ "text/csv" ]
|
||||
outputs:
|
||||
- name: report_file
|
||||
description: "Путь до папки куда пайплайн сохранит отчет об обучении"
|
||||
- name: output_data
|
||||
description: "Путь до папки куда пайплайн сохранит сгенерированные данные"
|
||||
- name: model
|
||||
description: "Путь до папки куда пайплайн сохранит модель"
|
35
app/syntdata-test-pipe.yaml
Normal file
35
app/syntdata-test-pipe.yaml
Normal file
|
@ -0,0 +1,35 @@
|
|||
apiVersion: "unified-platform.cs.hse.ru/v1"
|
||||
kind: ExperimentPipeline
|
||||
metadata:
|
||||
name: train-generate
|
||||
namespace: pu-mgalynchik-pa-mm25-synthdata
|
||||
spec:
|
||||
vars:
|
||||
- name: model
|
||||
- name: input_data
|
||||
- name: report_file
|
||||
- name: output_data
|
||||
stages:
|
||||
- name: synth-data-stage
|
||||
image:
|
||||
existingImageName: registry.platform-dev.stratpro.hse.ru/mm25-synthdata/synt_data_stage:c157543
|
||||
inputs:
|
||||
- name: input_data
|
||||
path: /unip/data/synthesize_data/input
|
||||
outputs:
|
||||
- name: report_file
|
||||
path: /unip/results/synthesize_data/report
|
||||
- name: output_data
|
||||
path: /unip/results/synthesize_data/output
|
||||
- name: model
|
||||
path: /unip/results/synthesize_data/model
|
||||
resourceLimits:
|
||||
cpu: "800m"
|
||||
memory: "4G"
|
||||
connectedBoxes:
|
||||
- name: synthdata-box
|
||||
path: /unip
|
||||
default: true
|
||||
mountS3Box:
|
||||
s3BoxName: synthdata-box
|
||||
|
7044
data/telecom_data.csv
Normal file
7044
data/telecom_data.csv
Normal file
File diff suppressed because it is too large
Load diff
183
mm25-module-card.md
Normal file
183
mm25-module-card.md
Normal file
|
@ -0,0 +1,183 @@
|
|||
### Проверка работы с модулем 25 через API
|
||||
|
||||
#### Обучение модели и генерация данных
|
||||
|
||||
##### Файловый API
|
||||
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
from requests.auth import HTTPBasicAuth
|
||||
import time
|
||||
import datetime
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Platform namespace of the module; it is also the first path segment of every API URL.
module_name = "pu-mgalynchik-pa-mm25-synthdata"
service_url = f"https://platform-dev-cs-hse.objectoriented.ru/{module_name}"

# Basic-auth credentials for the module APIs (left blank in this document).
username = ""
password = ""
basic_auth = HTTPBasicAuth(username, password)

# Value in seconds; the pipeline polling cell passes it to time.sleep()
# between two status checks.
REQUESTS_TIMEOUT = 20

# Sent with the JSON request that starts a pipeline trial.
headers = {"Content-Type": "application/json"}
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Путь к файлу с входными данными, как локально, так и в файловом хранилище фреймворка
|
||||
filename = "data/telecom_data.csv"
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Ask the files API for an upload slot for `filename` inside the "synthdata-box" box.
# The JSON reply is shown below; on success it carries a 'presigned_put_url'.
upload_target = f"{service_url}/files/synthdata-box/{filename}"

# An explicit timeout keeps the cell from hanging forever on a stalled connection.
response = requests.put(upload_target, auth=basic_auth, timeout=REQUESTS_TIMEOUT)
if response.status_code == 400:
    # On HTTP 400 fall back to reading the existing entry
    # (presumably the path is already registered in the box - TODO confirm).
    response = requests.get(upload_target, auth=basic_auth, timeout=REQUESTS_TIMEOUT)
display(response.json())
|
||||
```
|
||||
|
||||
|
||||
{'name': 'pu-mgalynchik-pa-mm25-synthdata/files/synthdata-box/data/telecom_data.csv',
|
||||
'presigned_put_url': 'https://storage.yandexcloud.net/platform-default-user-data/pu-mgalynchik-pa-mm25-synthdata/synthdata-box/users/developer/file_groups/data/telecom_data.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=YCAJEw7KWeJzGZz9pXuFdhLPP%2F20241201%2Fru-central1%2Fs3%2Faws4_request&X-Amz-Date=20241201T185235Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=e816ff92cb18b85f8f65e6c6536f9757271e12f28afb2c1dfc12616b462ce640'}
|
||||
|
||||
|
||||
|
||||
```python
|
||||
# Upload the local file to object storage through the presigned URL obtained above.
if response.status_code in (200, 201):
    result_urls = response.json()

    with open(filename, "rb") as f:
        # Passing the open file lets requests stream it from disk instead of
        # loading the whole dataset into memory first.
        response = requests.put(
            result_urls["presigned_put_url"],
            data=f,
            timeout=REQUESTS_TIMEOUT,
        )
    print(response.status_code)
else:
    # Nothing was uploaded: report why instead of printing a misleading status.
    print(f"No upload URL received: HTTP {response.status_code}")
|
||||
```
|
||||
|
||||
200
|
||||
|
||||
|
||||
##### Запуск пайплайна генерации данных
|
||||
|
||||
|
||||
```python
|
||||
# Name of the ExperimentPipeline to run.
pipeline_name = "train-generate"
# Trials endpoint of the pipelines API. Built from service_url so the platform
# host and module namespace are defined in exactly one place.
full_url = f"{service_url}/pipelines/{pipeline_name}/trials"
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Box-relative folders that receive each pipeline output, keyed by output variable name.
# NOTE(review): these paths spell "syntesize" while the stage mount paths in the
# pipeline manifest spell "synthesize" - confirm the two are meant to be independent.
_output_folders = {
    "report_file": "results/syntesize_data/report/",
    "output_data": "results/syntesize_data/output/",
    "model": "results/syntesize_data/model/",
}

# Request body for starting a trial: one CSV file input plus the three output locations.
synthdata_request = {
    "inputs": [
        {
            "name": "input_data",
            "datatype": "FILE",
            "content_type": "text/csv",
            "shape": [7043, 20],
            "data": "data/",
        },
    ],
    "output_vars": [
        {"name": var_name, "data": folder}
        for var_name, folder in _output_folders.items()
    ],
}
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Start a pipeline trial, poll it until it reports 'Completed', then print a
# timeline of the status conditions it went through.

# Upper bound on the total wait, so the cell cannot spin forever if the trial
# never reaches 'Completed' (for example because it failed).
MAX_WAIT_SECONDS = 3600
# Timestamps look like '2024-12-01T18:55:13.943784+0000'; %z parses the UTC
# offset instead of slicing it off.
TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%f%z'


def _fetch_conditions(url):
    """Return the ``status.conditions`` list of the trial resource at *url*.

    Raises ``requests.HTTPError`` when the tracking request itself fails.
    """
    reply = requests.get(url, auth=basic_auth, timeout=REQUESTS_TIMEOUT)
    reply.raise_for_status()
    return reply.json()['status']['conditions']


def _parse_time(stamp):
    """Convert a ``last_transition_time`` string into an aware datetime."""
    return datetime.datetime.strptime(stamp, TIME_FORMAT)


response = requests.post(
    full_url,
    headers=headers,
    auth=basic_auth,
    json=synthdata_request,
    timeout=REQUESTS_TIMEOUT,
)
# Check the status before touching the body: an error reply has no '_links'.
if response.status_code != 200:
    raise RuntimeError(
        f"Trial was not started: HTTP {response.status_code}: {response.text}"
    )
tracking_url = response.json()['_links']['self']['href']

deadline = time.monotonic() + MAX_WAIT_SECONDS
status_conditions = _fetch_conditions(tracking_url)
# An empty condition list is treated as "not finished yet".
while not status_conditions or status_conditions[-1]['type'] != 'Completed':
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Trial did not complete within {MAX_WAIT_SECONDS} seconds: {tracking_url}"
        )
    # REQUESTS_TIMEOUT doubles as the pause between two status checks.
    time.sleep(REQUESTS_TIMEOUT)
    status_conditions = _fetch_conditions(tracking_url)
last_status = status_conditions[-1]

# Print each condition with the time elapsed since the first one.
start_time = _parse_time(status_conditions[0]['last_transition_time'])
for status in status_conditions:
    status_time = _parse_time(status['last_transition_time'])
    elapsed = str(status_time - start_time).split('.', maxsplit=1)[0]
    print(f"[{elapsed}] Pipeline job status - {status['type']}")

display(last_status)
|
||||
```
|
||||
|
||||
[0:00:00] Pipeline job status - Started
|
||||
[0:02:34] Pipeline job status - OutputsAreValidated
|
||||
[0:02:36] Pipeline job status - Completed
|
||||
|
||||
|
||||
|
||||
{'type': 'Completed',
|
||||
'condition_status': 'True',
|
||||
'last_transition_time': '2024-12-01T18:55:13.943784+0000',
|
||||
'message': '',
|
||||
'reason': '',
|
||||
'stage': None}
|
||||
|
||||
|
||||
##### Результаты
|
||||
|
||||
|
||||
```python
|
||||
# List the files the pipeline wrote for every output variable of the request.
# The loop variable is deliberately not called `filename`: that name holds the
# input file path used by the upload cells and must not be overwritten with a dict.
for output_var in synthdata_request["output_vars"]:
    listing_url = f"{service_url}/files/synthdata-box/{output_var['data']}"
    response = requests.get(listing_url, auth=basic_auth, timeout=REQUESTS_TIMEOUT)
    if response.status_code in (200, 201):
        display(response.json())
    else:
        # Make a missing or unreadable output visible instead of skipping it silently.
        print(f"{output_var['name']}: HTTP {response.status_code}")
|
||||
```
|
||||
|
||||
|
||||
{'name': 'pu-mgalynchik-pa-mm25-synthdata/files/synthdata-box/results/syntesize_data/report/',
|
||||
'files': [{'name': 'pu-mgalynchik-pa-mm25-synthdata/files/synthdata-box/results/syntesize_data/report/report.json',
|
||||
'presigned_get_url': 'https://storage.yandexcloud.net/platform-default-user-data/pu-mgalynchik-pa-mm25-synthdata/synthdata-box/users/developer/file_groups/results/syntesize_data/report/report.json?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=YCAJEw7KWeJzGZz9pXuFdhLPP%2F20241201%2Fru-central1%2Fs3%2Faws4_request&X-Amz-Date=20241201T185519Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=58608407ac224691cacd571fe7f5edb3b80ccb1fbb25632b2d3cad6be541d049'}]}
|
||||
|
||||
|
||||
|
||||
{'name': 'pu-mgalynchik-pa-mm25-synthdata/files/synthdata-box/results/syntesize_data/output/',
|
||||
'files': [{'name': 'pu-mgalynchik-pa-mm25-synthdata/files/synthdata-box/results/syntesize_data/output/prediction.csv',
|
||||
'presigned_get_url': 'https://storage.yandexcloud.net/platform-default-user-data/pu-mgalynchik-pa-mm25-synthdata/synthdata-box/users/developer/file_groups/results/syntesize_data/output/prediction.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=YCAJEw7KWeJzGZz9pXuFdhLPP%2F20241201%2Fru-central1%2Fs3%2Faws4_request&X-Amz-Date=20241201T185519Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=5346314f79abfb31563be22b814a1286035cbb0d19f782f626166651b5691036'}]}
|
||||
|
||||
|
||||
|
||||
{'name': 'pu-mgalynchik-pa-mm25-synthdata/files/synthdata-box/results/syntesize_data/model/',
|
||||
'files': [{'name': 'pu-mgalynchik-pa-mm25-synthdata/files/synthdata-box/results/syntesize_data/model/model.joblib',
|
||||
'presigned_get_url': 'https://storage.yandexcloud.net/platform-default-user-data/pu-mgalynchik-pa-mm25-synthdata/synthdata-box/users/developer/file_groups/results/syntesize_data/model/model.joblib?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=YCAJEw7KWeJzGZz9pXuFdhLPP%2F20241201%2Fru-central1%2Fs3%2Faws4_request&X-Amz-Date=20241201T185519Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=41d8ded4948d817538aa65a12b6cfc028d7df5e0af963e5ce7c84d509c458b06'}]}
|
||||
|
||||
|
||||
##### Оценка качества синтетических данных (Overall Quality Score):
|
||||
|
||||
|
||||
```python
|
||||
# Download and show the quality report produced by the pipeline.
# The first entry of "output_vars" is the report folder ("report_file").
report_fname = synthdata_request["output_vars"][0]['data']
response = requests.get(
    f"{service_url}/files/synthdata-box/{report_fname}",
    auth=basic_auth,
    timeout=REQUESTS_TIMEOUT,
)
if response.status_code in (200, 201):
    report_files = response.json()['files']
    if report_files:
        # The presigned URL carries its own signature in the query string and
        # points at the external storage host, so the platform credentials are
        # NOT attached to this request.
        response = requests.get(
            report_files[0]['presigned_get_url'],
            timeout=REQUESTS_TIMEOUT,
        )
        display(response.json())
    else:
        print("The report folder is empty")
else:
    print(f"Could not list the report folder: HTTP {response.status_code}")
|
||||
```
|
||||
|
||||
|
||||
{'Overall Quality Score': 0.9413369301611234}
|
||||
|
3
results/report/report.json
Normal file
3
results/report/report.json
Normal file
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"Overall Quality Score": 0.9456371017271323
|
||||
}
|
Loading…
Reference in a new issue