Rudimentary optimization with APM/QA.

Added the script 'apm_quality_assessment_optimize.py' for finding the
parameters that minimize a custom function of the scores generated by
APM-QA. The script reuses the existing functionality for filtering the
data by configs, scores and outputs.

To achieve that, some modularization has been done: the part of
apm_quality_assessment_export that reads the data into a
pandas.DataFrame has been moved into quality_assessment.collect_data.

TESTED=through extensive manual tests. Unit tests for the user
scripts and 'collect_data' are missing, because we don't have a test
framework for loading/exporting fake data.
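
For illustration, a custom weighting passed to _FindOptimalParameter could
look like the sketch below; the function name and the per-score weights are
hypothetical, while the input follows the [{'params': ..., 'scores': ...}]
structure built by _ConfigurationAndScores in the new script:
```
# Hedged sketch of a custom weighting for _FindOptimalParameter. The
# per-score weights and the score name 'polqa' are illustrative only.
def WeightedScoreSum(scores_and_params):
  weights = {'polqa': 2.0}  # Hypothetical weights; default weight is 1.0.
  total = 0.0
  for entry in scores_and_params:
    for score_name, value in entry['scores'].items():
      total += weights.get(score_name, 1.0) * value
  return total
```
Since _FindOptimalParameter keeps the combination with the smallest weighted
value, negate the sum if larger scores are better.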

BUG=webrtc:7218

Change-Id: I5521b952970243da05fc4db1b9feef87a2e5ccad
Reviewed-on: https://chromium-review.googlesource.com/643292
Commit-Queue: Alex Loiko <aleloi@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#19780}
Alex Loiko
2017-09-11 17:56:20 +02:00
committed by Commit Bot
parent 3b3c9c4eb0
commit 357429dd1e
6 changed files with 441 additions and 235 deletions


@@ -24,6 +24,7 @@ copy("scripts") {
"apm_quality_assessment.sh",
"apm_quality_assessment_export.py",
"apm_quality_assessment_gencfgs.py",
"apm_quality_assessment_optimize.py",
]
outputs = [
"$root_build_dir/py_quality_assessment/{{source_file_part}}",
@@ -53,6 +54,7 @@ copy("lib") {
sources = [
"quality_assessment/__init__.py",
"quality_assessment/audioproc_wrapper.py",
"quality_assessment/collect_data.py",
"quality_assessment/data_access.py",
"quality_assessment/echo_path_simulation.py",
"quality_assessment/echo_path_simulation_factory.py",


@@ -81,7 +81,7 @@ export separate reports. In this case, you can use the
For instance:
```
$ ./apm_quality_assessment-export.py \
$ ./apm_quality_assessment_export.py \
-o output/ \
-c "(^default$)|(.*AE.*)" \
-t \(white_noise\) \


@@ -11,142 +11,13 @@
HTML file.
"""
import argparse
import logging
import glob
import os
import re
import sys
try:
import pandas as pd
except ImportError:
logging.critical('Cannot import the third-party Python package pandas')
sys.exit(1)
import quality_assessment.data_access as data_access
import quality_assessment.collect_data as collect_data
import quality_assessment.export as export
import quality_assessment.simulation as sim
# Compiled regular expressions used to extract score descriptors.
RE_CONFIG_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixApmConfig() + r'(.+)')
RE_CAPTURE_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixCapture() + r'(.+)')
RE_RENDER_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixRender() + r'(.+)')
RE_ECHO_SIM_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixEchoSimulator() + r'(.+)')
RE_TEST_DATA_GEN_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixTestDataGenerator() + r'(.+)')
RE_TEST_DATA_GEN_PARAMS = re.compile(
sim.ApmModuleSimulator.GetPrefixTestDataGeneratorParameters() + r'(.+)')
RE_SCORE_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixScore() + r'(.+)(\..+)')
def _InstanceArgumentsParser():
"""Arguments parser factory.
"""
parser = argparse.ArgumentParser(description=(
'Exports pre-computed APM module quality assessment results into HTML '
'tables.'))
parser.add_argument('-o', '--output_dir', required=True,
help=('the same base path used with the '
'apm_quality_assessment tool'))
parser.add_argument('-f', '--filename_suffix',
help=('suffix of the exported file'))
parser.add_argument('-c', '--config_names', type=re.compile,
help=('regular expression to filter the APM configuration'
' names'))
parser.add_argument('-i', '--capture_names', type=re.compile,
help=('regular expression to filter the capture signal '
'names'))
parser.add_argument('-r', '--render_names', type=re.compile,
help=('regular expression to filter the render signal '
'names'))
parser.add_argument('-e', '--echo_simulator_names', type=re.compile,
help=('regular expression to filter the echo simulator '
'names'))
parser.add_argument('-t', '--test_data_generators', type=re.compile,
help=('regular expression to filter the test data '
'generator names'))
parser.add_argument('-s', '--eval_scores', type=re.compile,
help=('regular expression to filter the evaluation score '
'names'))
return parser
def _GetScoreDescriptors(score_filepath):
"""Extracts a score descriptor from the given score file path.
Args:
score_filepath: path to the score file.
Returns:
A tuple of strings (APM configuration name, capture audio track name,
render audio track name, echo simulator name, test data generator name,
test data generator parameters as string, evaluation score name).
"""
fields = score_filepath.split(os.sep)[-7:]
extract_name = lambda index, reg_expr: (
reg_expr.match(fields[index]).groups(0)[0])
return (
extract_name(0, RE_CONFIG_NAME),
extract_name(1, RE_CAPTURE_NAME),
extract_name(2, RE_RENDER_NAME),
extract_name(3, RE_ECHO_SIM_NAME),
extract_name(4, RE_TEST_DATA_GEN_NAME),
extract_name(5, RE_TEST_DATA_GEN_PARAMS),
extract_name(6, RE_SCORE_NAME),
)
def _ExcludeScore(config_name, capture_name, render_name, echo_simulator_name,
test_data_gen_name, score_name, args):
"""Decides whether excluding a score.
A set of optional regular expressions in args is used to determine if the
score should be excluded (depending on its |*_name| descriptors).
Args:
config_name: APM configuration name.
capture_name: capture audio track name.
render_name: render audio track name.
echo_simulator_name: echo simulator name.
test_data_gen_name: test data generator name.
score_name: evaluation score name.
args: parsed arguments.
Returns:
A boolean.
"""
value_regexpr_pairs = [
(config_name, args.config_names),
(capture_name, args.capture_names),
(render_name, args.render_names),
(echo_simulator_name, args.echo_simulator_names),
(test_data_gen_name, args.test_data_generators),
(score_name, args.eval_scores),
]
# Score accepted if each value matches the corresponding regular expression.
for value, regexpr in value_regexpr_pairs:
if regexpr is None:
continue
if not regexpr.match(value):
return True
return False
def _BuildOutputFilename(filename_suffix):
@@ -162,111 +33,18 @@ def _BuildOutputFilename(filename_suffix):
return 'results.html'
return 'results-{}.html'.format(filename_suffix)
def _FindScores(src_path, args):
"""Given a search path, find scores and return a DataFrame object.
Args:
src_path: Search path pattern.
args: parsed arguments.
Returns:
A DataFrame object.
"""
# Get scores.
scores = []
for score_filepath in glob.iglob(src_path):
# Extract score descriptor fields from the path.
(config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
test_data_gen_params,
score_name) = _GetScoreDescriptors(score_filepath)
# Ignore the score if required.
if _ExcludeScore(
config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
score_name,
args):
logging.info(
'ignored score: %s %s %s %s %s %s',
config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
score_name)
continue
# Read metadata and score.
metadata = data_access.Metadata.LoadAudioTestDataPaths(
os.path.split(score_filepath)[0])
score = data_access.ScoreFile.Load(score_filepath)
# Add a score with its descriptor fields.
scores.append((
metadata['clean_capture_input_filepath'],
metadata['echo_free_capture_filepath'],
metadata['echo_filepath'],
metadata['render_filepath'],
metadata['capture_filepath'],
metadata['apm_output_filepath'],
metadata['apm_reference_filepath'],
config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
test_data_gen_params,
score_name,
score,
))
return pd.DataFrame(
data=scores,
columns=(
'clean_capture_input_filepath',
'echo_free_capture_filepath',
'echo_filepath',
'render_filepath',
'capture_filepath',
'apm_output_filepath',
'apm_reference_filepath',
'apm_config',
'capture',
'render',
'echo_simulator',
'test_data_gen',
'test_data_gen_params',
'eval_score_name',
'score',
))
def main():
# Init.
logging.basicConfig(level=logging.DEBUG) # TODO(alessio): INFO once debugged.
parser = _InstanceArgumentsParser()
parser = collect_data.InstanceArgumentsParser()
parser.description = ('Exports pre-computed APM module quality assessment '
'results into HTML tables')
args = parser.parse_args()
# Get the scores.
src_path = os.path.join(
args.output_dir,
sim.ApmModuleSimulator.GetPrefixApmConfig() + '*',
sim.ApmModuleSimulator.GetPrefixCapture() + '*',
sim.ApmModuleSimulator.GetPrefixRender() + '*',
sim.ApmModuleSimulator.GetPrefixEchoSimulator() + '*',
sim.ApmModuleSimulator.GetPrefixTestDataGenerator() + '*',
sim.ApmModuleSimulator.GetPrefixTestDataGeneratorParameters() + '*',
sim.ApmModuleSimulator.GetPrefixScore() + '*')
src_path = collect_data.ConstructSrcPath(args)
logging.debug(src_path)
scores_data_frame = _FindScores(src_path, args)
scores_data_frame = collect_data.FindScores(src_path, args)
# Export.
output_filepath = os.path.join(args.output_dir, _BuildOutputFilename(


@@ -0,0 +1,179 @@
#!/usr/bin/env python
# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
"""Finds the APM configuration that maximizes a provided metric by
parsing the output generated apm_quality_assessment.py.
"""
from __future__ import division
import collections
import logging
import os
import quality_assessment.data_access as data_access
import quality_assessment.collect_data as collect_data
def _InstanceArgumentsParser():
"""Arguments parser factory. Extends the arguments from 'collect_data'
with a few extra for selecting what parameters to optimize for.
"""
parser = collect_data.InstanceArgumentsParser()
parser.description = (
'Rudimentary optimization of a function over different parameter '
'combinations.')
parser.add_argument('-n', '--config_dir', required=False,
help=('path to the folder with the configuration files'),
default='apm_configs')
parser.add_argument('-p', '--params', required=True, nargs='+',
help=('parameters to parse from the config files in '
'config_dir'))
parser.add_argument('-z', '--params_not_to_optimize', required=False,
nargs='+', default=[],
help=('parameters from `params` not to be optimized for'))
return parser
def _ConfigurationAndScores(data_frame, params,
params_not_to_optimize, config_dir):
"""Returns a list of all configurations and scores.
Args:
data_frame: A pandas data frame with the scores and config name
returned by _FindScores.
params: The parameter names to parse from configs the config
directory
params_not_to_optimize: The parameter names which shouldn't affect
the optimal parameter
selection. E.g., fixed settings and not
tunable parameters.
config_dir: Path to folder with config files.
Returns:
Dictionary of the form
{param_combination: [{params: {param1: value1, ...},
scores: {score1: value1, ...}}]}.
The key `param_combination` runs over all parameter combinations
of the parameters in `params` and not in
`params_not_to_optimize`. A corresponding value is a list of all
param combinations for params in `params_not_to_optimize` and
their scores.
"""
results = collections.defaultdict(list)
config_names = data_frame['apm_config'].drop_duplicates().values.tolist()
score_names = data_frame['eval_score_name'].drop_duplicates().values.tolist()
# Normalize the scores
normalization_constants = {}
for score_name in score_names:
scores = data_frame[data_frame.eval_score_name == score_name].score
normalization_constants[score_name] = max(scores)
params_to_optimize = [p for p in params if p not in params_not_to_optimize]
param_combination = collections.namedtuple("ParamCombination",
params_to_optimize)
for config_name in config_names:
config_json = data_access.AudioProcConfigFile.Load(
os.path.join(config_dir, config_name + ".json"))
scores = {}
data_cell = data_frame[data_frame.apm_config == config_name]
for score_name in score_names:
data_cell_scores = data_cell[data_cell.eval_score_name ==
score_name].score
scores[score_name] = sum(data_cell_scores) / len(data_cell_scores)
scores[score_name] /= normalization_constants[score_name]
result = {'scores': scores, 'params': {}}
config_optimize_params = {}
for param in params:
if param in params_to_optimize:
config_optimize_params[param] = config_json['-' + param]
else:
result['params'][param] = config_json['-' + param]
current_param_combination = param_combination( # pylint: disable=star-args
**config_optimize_params)
results[current_param_combination].append(result)
return results
def _FindOptimalParameter(configs_and_scores, score_weighting):
"""Finds the config producing the maximal score.
Args:
configs_and_scores: structure of the form returned by
_ConfigurationAndScores
score_weighting: a function to weight together all score values of
the form [{params: {param1: value1, ...}, scores:
{score1: value1, ...}}] into a numeric
value
Returns:
the config that has the smallest value of |score_weighting| applied
to its scores.
"""
min_score = float('+inf')
best_params = None
for config in configs_and_scores:
scores_and_params = configs_and_scores[config]
current_score = score_weighting(scores_and_params)
if current_score < min_score:
min_score = current_score
best_params = config
logging.debug("Score: %f", current_score)
logging.debug("Config: %s", str(config))
return best_params
def _ExampleWeighting(scores_and_configs):
"""Example argument to `_FindOptimalParameter`
Args:
scores_and_configs: a list of configs and scores, in the form
described in _FindOptimalParameter
Returns:
numeric value, the sum of all scores
"""
res = 0
for score_config in scores_and_configs:
res += sum(score_config['scores'].values())
return res
def main():
# Init.
# TODO(alessiob): INFO once debugged.
logging.basicConfig(level=logging.DEBUG)
parser = _InstanceArgumentsParser()
args = parser.parse_args()
# Get the scores.
src_path = collect_data.ConstructSrcPath(args)
logging.debug('Src path <%s>', src_path)
scores_data_frame = collect_data.FindScores(src_path, args)
all_scores = _ConfigurationAndScores(scores_data_frame,
args.params,
args.params_not_to_optimize,
args.config_dir)
opt_param = _FindOptimalParameter(all_scores, _ExampleWeighting)
logging.info('Optimal parameter combination: <%s>', opt_param)
logging.info('Its score values: <%s>', all_scores[opt_param])
if __name__ == "__main__":
main()
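
For reference, a self-contained sketch of the data shapes the script works
with; the parameter names, score names and values below are made up, and the
inline weighting mirrors (rather than replaces) _ExampleWeighting and the
minimum selection done by _FindOptimalParameter:
```
# Illustrative only: mimics the dict returned by _ConfigurationAndScores.
# Parameter names, score names and values are hypothetical.
import collections

ParamCombination = collections.namedtuple('ParamCombination',
                                          ['agc2_level', 'ns_level'])

configs_and_scores = {
    ParamCombination(agc2_level=5, ns_level=2): [
        {'params': {'sample_rate': 48000},  # Parameter not optimized for.
         'scores': {'polqa': 0.93, 'audio_level': 0.87}},
    ],
    ParamCombination(agc2_level=9, ns_level=1): [
        {'params': {'sample_rate': 48000},
         'scores': {'polqa': 0.71, 'audio_level': 0.95}},
    ],
}

# Sum all score values per combination and keep the smallest sum, like
# _FindOptimalParameter(configs_and_scores, _ExampleWeighting) does.
def ExampleWeighting(scores_and_params):
  return sum(sum(e['scores'].values()) for e in scores_and_params)

best = min(configs_and_scores,
           key=lambda c: ExampleWeighting(configs_and_scores[c]))
print(best)  # ParamCombination(agc2_level=9, ns_level=1) for these values.
```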


@@ -0,0 +1,244 @@
# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
"""Imports a filtered subset of the scores and configurations computed
by apm_quality_assessment.py into a pandas data frame.
"""
import argparse
import glob
import logging
import os
import re
import sys
try:
import pandas as pd
except ImportError:
logging.critical('Cannot import the third-party Python package pandas')
sys.exit(1)
from . import data_access as data_access
from . import simulation as sim
# Compiled regular expressions used to extract score descriptors.
RE_CONFIG_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixApmConfig() + r'(.+)')
RE_CAPTURE_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixCapture() + r'(.+)')
RE_RENDER_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixRender() + r'(.+)')
RE_ECHO_SIM_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixEchoSimulator() + r'(.+)')
RE_TEST_DATA_GEN_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixTestDataGenerator() + r'(.+)')
RE_TEST_DATA_GEN_PARAMS = re.compile(
sim.ApmModuleSimulator.GetPrefixTestDataGeneratorParameters() + r'(.+)')
RE_SCORE_NAME = re.compile(
sim.ApmModuleSimulator.GetPrefixScore() + r'(.+)(\..+)')
def InstanceArgumentsParser():
"""Arguments parser factory.
"""
parser = argparse.ArgumentParser(description=(
'Override this description in a user script by changing'
' `parser.description` of the returned parser.'))
parser.add_argument('-o', '--output_dir', required=True,
help=('the same base path used with the '
'apm_quality_assessment tool'))
parser.add_argument('-f', '--filename_suffix',
help=('suffix of the exported file'))
parser.add_argument('-c', '--config_names', type=re.compile,
help=('regular expression to filter the APM configuration'
' names'))
parser.add_argument('-i', '--capture_names', type=re.compile,
help=('regular expression to filter the capture signal '
'names'))
parser.add_argument('-r', '--render_names', type=re.compile,
help=('regular expression to filter the render signal '
'names'))
parser.add_argument('-e', '--echo_simulator_names', type=re.compile,
help=('regular expression to filter the echo simulator '
'names'))
parser.add_argument('-t', '--test_data_generators', type=re.compile,
help=('regular expression to filter the test data '
'generator names'))
parser.add_argument('-s', '--eval_scores', type=re.compile,
help=('regular expression to filter the evaluation score '
'names'))
return parser
def _GetScoreDescriptors(score_filepath):
"""Extracts a score descriptor from the given score file path.
Args:
score_filepath: path to the score file.
Returns:
A tuple of strings (APM configuration name, capture audio track name,
render audio track name, echo simulator name, test data generator name,
test data generator parameters as string, evaluation score name).
"""
fields = score_filepath.split(os.sep)[-7:]
extract_name = lambda index, reg_expr: (
reg_expr.match(fields[index]).groups(0)[0])
return (
extract_name(0, RE_CONFIG_NAME),
extract_name(1, RE_CAPTURE_NAME),
extract_name(2, RE_RENDER_NAME),
extract_name(3, RE_ECHO_SIM_NAME),
extract_name(4, RE_TEST_DATA_GEN_NAME),
extract_name(5, RE_TEST_DATA_GEN_PARAMS),
extract_name(6, RE_SCORE_NAME),
)
def _ExcludeScore(config_name, capture_name, render_name, echo_simulator_name,
test_data_gen_name, score_name, args):
"""Decides whether excluding a score.
A set of optional regular expressions in args is used to determine if the
score should be excluded (depending on its |*_name| descriptors).
Args:
config_name: APM configuration name.
capture_name: capture audio track name.
render_name: render audio track name.
echo_simulator_name: echo simulator name.
test_data_gen_name: test data generator name.
score_name: evaluation score name.
args: parsed arguments.
Returns:
A boolean.
"""
value_regexpr_pairs = [
(config_name, args.config_names),
(capture_name, args.capture_names),
(render_name, args.render_names),
(echo_simulator_name, args.echo_simulator_names),
(test_data_gen_name, args.test_data_generators),
(score_name, args.eval_scores),
]
# Score accepted if each value matches the corresponding regular expression.
for value, regexpr in value_regexpr_pairs:
if regexpr is None:
continue
if not regexpr.match(value):
return True
return False
def FindScores(src_path, args):
"""Given a search path, find scores and return a DataFrame object.
Args:
src_path: Search path pattern.
args: parsed arguments.
Returns:
A DataFrame object.
"""
# Get scores.
scores = []
for score_filepath in glob.iglob(src_path):
# Extract score descriptor fields from the path.
(config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
test_data_gen_params,
score_name) = _GetScoreDescriptors(score_filepath)
# Ignore the score if required.
if _ExcludeScore(
config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
score_name,
args):
logging.info(
'ignored score: %s %s %s %s %s %s',
config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
score_name)
continue
# Read metadata and score.
metadata = data_access.Metadata.LoadAudioTestDataPaths(
os.path.split(score_filepath)[0])
score = data_access.ScoreFile.Load(score_filepath)
# Add a score with its descriptor fields.
scores.append((
metadata['clean_capture_input_filepath'],
metadata['echo_free_capture_filepath'],
metadata['echo_filepath'],
metadata['render_filepath'],
metadata['capture_filepath'],
metadata['apm_output_filepath'],
metadata['apm_reference_filepath'],
config_name,
capture_name,
render_name,
echo_simulator_name,
test_data_gen_name,
test_data_gen_params,
score_name,
score,
))
return pd.DataFrame(
data=scores,
columns=(
'clean_capture_input_filepath',
'echo_free_capture_filepath',
'echo_filepath',
'render_filepath',
'capture_filepath',
'apm_output_filepath',
'apm_reference_filepath',
'apm_config',
'capture',
'render',
'echo_simulator',
'test_data_gen',
'test_data_gen_params',
'eval_score_name',
'score',
))
def ConstructSrcPath(args):
return os.path.join(
args.output_dir,
sim.ApmModuleSimulator.GetPrefixApmConfig() + '*',
sim.ApmModuleSimulator.GetPrefixCapture() + '*',
sim.ApmModuleSimulator.GetPrefixRender() + '*',
sim.ApmModuleSimulator.GetPrefixEchoSimulator() + '*',
sim.ApmModuleSimulator.GetPrefixTestDataGenerator() + '*',
sim.ApmModuleSimulator.GetPrefixTestDataGeneratorParameters() + '*',
sim.ApmModuleSimulator.GetPrefixScore() + '*')
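
For reference, a hedged sketch of how a user script can consume this module
end to end; the output directory and the config-name filter below are
placeholders, and results must already have been generated by
apm_quality_assessment.py:
```
# Hedged sketch of a collect_data consumer; paths and filters are examples.
import quality_assessment.collect_data as collect_data

parser = collect_data.InstanceArgumentsParser()
parser.description = 'Example consumer of the collected scores.'
args = parser.parse_args(['-o', 'output/', '-c', '(^default$)|(.*AE.*)'])

src_path = collect_data.ConstructSrcPath(args)
scores_data_frame = collect_data.FindScores(src_path, args)

# One row per score file; average each evaluation score per APM config.
print(scores_data_frame.groupby(
    ['apm_config', 'eval_score_name'])['score'].mean())
```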


@@ -6,6 +6,7 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
import functools
import hashlib
import os
import re
@@ -79,7 +80,8 @@ class HtmlExport(object):
def _BuildBody(self):
"""Builds the content of the <body> section."""
score_names = self._scores_data_frame.eval_score_name.unique().tolist()
score_names = self._scores_data_frame['eval_score_name'].drop_duplicates(
).values.tolist()
html = [
('<div class="mdl-layout mdl-js-layout mdl-layout--fixed-header '
@@ -178,7 +180,7 @@ class HtmlExport(object):
score_name + test_data_gen + test_data_gen_params + apm_config)
if stats['count'] == 1:
# Show the only available score.
item_id = hashlib.md5(items_id_prefix).hexdigest()
item_id = hashlib.md5(items_id_prefix.encode('utf-8')).hexdigest()
html.append('<div id="single-value-{0}">{1:f}</div>'.format(
item_id, scores['score'].mean()))
html.append('<div class="mdl-tooltip" data-mdl-for="single-value-{}">{}'
@@ -186,7 +188,8 @@ class HtmlExport(object):
else:
# Show stats.
for stat_name in ['min', 'max', 'mean', 'std dev']:
item_id = hashlib.md5(items_id_prefix + stat_name).hexdigest()
item_id = hashlib.md5(
(items_id_prefix + stat_name).encode('utf-8')).hexdigest()
html.append('<div id="stats-{0}">{1:f}</div>'.format(
item_id, stats[stat_name]))
html.append('<div class="mdl-tooltip" data-mdl-for="stats-{}">{}'
@@ -289,7 +292,7 @@ class HtmlExport(object):
masks.append(self._scores_data_frame.test_data_gen == test_data_gen)
masks.append(
self._scores_data_frame.test_data_gen_params == test_data_gen_params)
mask = reduce((lambda i1, i2: i1 & i2), masks)
mask = functools.reduce((lambda i1, i2: i1 & i2), masks)
del masks
return self._scores_data_frame[mask]
@@ -302,7 +305,7 @@ class HtmlExport(object):
masks.append(scores.capture == capture)
masks.append(scores.render == render)
masks.append(scores.echo_simulator == echo_simulator)
mask = reduce((lambda i1, i2: i1 & i2), masks)
mask = functools.reduce((lambda i1, i2: i1 & i2), masks)
del masks
sliced_data = scores[mask]
@@ -333,7 +336,7 @@ class HtmlExport(object):
return 'score-stats-dialog-' + hashlib.md5(
'score-stats-inspector-{}-{}-{}-{}'.format(
score_name, apm_config, test_data_gen,
test_data_gen_params)).hexdigest()
test_data_gen_params).encode('utf-8')).hexdigest()
@classmethod
def _Save(cls, output_filepath, html):
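
The export.py hunks above are Python 3 compatibility fixes. A minimal
standalone sketch of the two behaviors they account for (not part of the
change itself):
```
# Minimal sketch of the two Python 3 differences addressed above.
import functools
import hashlib

# hashlib.md5() only accepts bytes under Python 3, hence .encode('utf-8').
item_id = hashlib.md5('score-stats-inspector'.encode('utf-8')).hexdigest()

# reduce() is no longer a builtin in Python 3; functools.reduce works in
# both Python 2 and 3, e.g. to AND together a list of boolean masks.
mask = functools.reduce(lambda m1, m2: m1 & m2, [True, True, False])
print(item_id)
print(mask)
```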