I have come up with this code but it is not yet correct (and of course I am not even sure at this point that I am doing something sensible in the first place):
def create_dev_test_split(path_to_profiles, output_dir, test_ratio, db_schema):
# Split item ids into dev and test:
dev_ids, test_ids = split_ids(path_to_profiles, test_ratio)
# Create a temporary empty text file:
with open(output_dir + '/temp.txt', 'w') as f:
f.write('')
commands.mkprof(output_dir + '/dev/', source=output_dir + '/temp.txt', schema=db_schema)
commands.mkprof(output_dir + '/test/', source=output_dir + '/temp.txt', schema=db_schema)
dev_profile = itsdb.TestSuite(output_dir + '/dev/')
test_profile = itsdb.TestSuite(output_dir + '/test/')
for ts_path in glob.glob(path_to_profiles + '/*'):
ts = itsdb.TestSuite(ts_path)
for i,item in enumerate(ts['item']):
if item['i-id'] in dev_ids:
copy_from_db(dev_profile, item, ts_path)
elif item['i-id'] in test_ids:
copy_from_db(test_profile, item, ts_path)
else:
print('Item in neither set.')
dev_profile.commit()
test_profile.commit()
def copy_from_db(profile, item, ts_path):
profile['item'].append(item)
q_parse = '* from parse where i-id = ' + str(item['i-id'])
selection_parse = commands.select(q_parse, ts_path)
# The non-empty files in the original database:
related_tables = ['run', 'decision', 'edge', 'preference', 'result', 'tree']
for sdp in selection_parse.data:
r = itsdb.Row(selection_parse.fields, sdp)
parse_id = r['parse-id']
profile['parse'].append(r)
for rt in related_tables:
q = '* from ' + rt + ' where parse-id = ' + str(parse_id)
selection = commands.select(q, ts_path)
for sd in selection.data:
rs = itsdb.Row(selection.fields, sd)
profile[rt].append(rs)
However, even though I am using the same relations
file as in the original test suite, in the end I cannot write the new test suite out because of a mismatch in fields:
Traceback (most recent call last):
File "/home/olga/delphin/GAUSS/gauss-repo/venv/lib/python3.8/site-packages/delphin/tsdb.py", line 855, in write
(join(record, fields) + '\n').encode(encoding))
File "/home/olga/delphin/GAUSS/gauss-repo/venv/lib/python3.8/site-packages/delphin/tsdb.py", line 492, in join
_mismatched_counts(values, fields)
File "/home/olga/delphin/GAUSS/gauss-repo/venv/lib/python3.8/site-packages/delphin/tsdb.py", line 502, in _mismatched_counts
raise TSDBError('number of columns ({}) != number of fields ({})'
delphin.tsdb.TSDBError: number of columns (23) != number of fields (21)
Upon inspection, the difference is the following:
But at this point I rather suspect I am doing something that I am not supposed to be doing, anyway… It’s just that I have treebanked profiles which I would like to rearrange but I would really like to avoid having to retreebank them.