pubs2020.bib

@inproceedings{stoller2020sequnetmodelling,
  author = {Stoller, D and Tian, M and Ewert, S and Dixon, S},
  booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence (IJCAI 2020)},
  month = {Jul},
  pages = {2893--2900},
  title = {Seq-U-Net: A One-Dimensional Causal U-Net for Efficient Sequence Modelling},
  year = {2020},
  doi = {10.24963/ijcai.2020/400},
  day = {1}
}
@inproceedings{demirel2020automaticselfattention,
  author = {Demirel, E and Ahlback, S and Dixon, S},
  booktitle = {Proceedings of the International Joint Conference on Neural Networks},
  month = {Jul},
  title = {Automatic Lyrics Transcription using Dilated Convolutional Neural Networks with Self-Attention},
  year = {2020},
  abstract = {Speech recognition is a well-developed research field and current state-of-the-art systems are used in many applications in the software industry, yet no comparably robust system exists for the recognition of words and sentences from singing voice. This paper proposes a complete pipeline for this task, commonly referred to as automatic lyrics transcription (ALT). We have trained convolutional time-delay neural networks with self-attention on monophonic karaoke recordings using a sequence classification objective for building the acoustic model. The dataset used in this study, DAMP - Sing! 300x30x2 [1], is filtered to contain songs with only English lyrics. Different language models are tested, including MaxEnt and Recurrent Neural Network based methods, which are trained on the lyrics of pop songs in English. An in-depth analysis of the self-attention mechanism is carried out while tuning its context width and the number of attention heads. Using the best settings, our system achieves a notable improvement over the state of the art in ALT and provides a new baseline for the task.},
  doi = {10.1109/IJCNN48605.2020.9207052},
  isbn = {9781728169262},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{hanlon2020theextractor,
  author = {O'Hanlon, K and Sandler, MB},
  booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  month = {May},
  pages = {3752--3756},
  title = {The FifthNet Chroma Extractor},
  volume = {2020-May},
  year = {2020},
  abstract = {Deep Learning (DL) is commonly used in music processing tasks such as Automatic Chord Recognition (ACR), for which Convolutional Neural Networks (CNNs) are popular tools. Compression of CNNs has become a research topic of interest, focused on post-pruning of learnt networks and development of less expensive network elements. CNNs assemble high level structure in data from small simple patterns. Music signals are often processed in the spectral domain where much known structure is present. We propose the FifthNet, a neural network for chroma-based ACR that incorporates known spectral structures in its design through data manipulation. We find that FifthNet is competitive with popular ACR networks while using only a small fraction of their network parameters.},
  doi = {10.1109/ICASSP40776.2020.9053714},
  isbn = {9781509066315},
  issn = {1520-6149},
  day = {1},
  publicationstatus = {published}
}
@article{metzig2020classificationtunes,
  author = {Metzig, C and Gould, M and Noronha, R and Abbey, R and Sandler, M and Colijn, C},
  journal = {Pattern Recognition Letters},
  month = {May},
  pages = {356--364},
  title = {Classification of origin with feature selection and network construction for folk tunes},
  volume = {133},
  year = {2020},
  abstract = {We address the question of to what extent the origin of folk songs can be predicted, and construct a song similarity network. For this we use feature selection and train a random forest classifier on extracted melody n-grams and rhythm grams of songs from 7 different groups of origin, 80 from each. We use its feature importances to reduce the feature space dimension for the construction of an informative network, which we visualized in an interactive web application. These tools have vast application in large-scale exploration of digitized music databases and for specific questions in musicology.},
  doi = {10.1016/j.patrec.2020.03.023},
  issn = {0167-8655},
  day = {1},
  publicationstatus = {published}
}
@article{kudumakis2020theblockchains,
  author = {Kudumakis, P and Wilmering, T and Sandler, M and Rodríguez-Doncel, V and Boch, L and Delgado, J},
  journal = {IEEE Signal Processing Magazine},
  month = {Feb},
  number = {2},
  pages = {89--95},
  publisher = {Institute of Electrical and Electronics Engineers},
  title = {The Challenge: From MPEG Intellectual Property Rights Ontologies to Smart Contracts and Blockchains},
  volume = {37},
  year = {2020},
  abstract = {The Moving Picture Experts Group (MPEG) is an International Organization for Standardization/International Electrotechnical Commission (ISO/IEC) working group that develops media coding standards. These standards include a set of ontologies for the codification of intellectual property rights (IPR) information related to media. The Media Value Chain Ontology (MVCO) facilitates rights tracking for fair, timely, and transparent payment of royalties by capturing user roles and their permissible actions on a particular IP entity. The Audio Value Chain Ontology (AVCO) extends MVCO functionality related to the description of IP entities in the audio domain, e.g., multitrack audio and time segments. The Media Contract Ontology (MCO) facilitates the conversion of narrative contracts to digital ones. Furthermore, the axioms in these ontologies can drive the execution of rights-related workflows in controlled environments, e.g., blockchains, where transparency and interoperability is favored toward fair trade of music and media. Thus, the aim of this article is to create awareness of the MPEG IPR ontologies developed in the last few years and the work currently taking place addressing the challenge identified toward the execution of such ontologies as smart contracts on blockchain environments.},
  doi = {10.1109/MSP.2019.2955207},
  issn = {1053-5888},
  day = {26},
  publicationstatus = {published}
}
@article{wilmering2020aeffects,
  author = {Wilmering, T and Moffat, DJ and Milo, A and Sandler, M},
  journal = {Applied Sciences},
  month = {Jan},
  number = {3},
  publisher = {MDPI AG},
  title = {A History of Audio Effects},
  volume = {10},
  year = {2020},
  abstract = {Audio effects are an essential tool that the field of music production relies upon. The ability to intentionally manipulate and modify a piece of sound has opened up considerable opportunities for music making. The evolution of technology has often driven new audio tools and effects, from early architectural acoustics through electromechanical and electronic devices to the digitisation of music production studios. Throughout time, music has constantly borrowed ideas and technological advancements from all other fields and contributed back to the innovative technology. This is defined as transsectorial innovation and fundamentally underpins the technological developments of audio effects. The development and evolution of audio effect technology is discussed, highlighting major technical breakthroughs and the impact of available audio effects.},
  doi = {10.3390/app10030791},
  issn = {2076-3417},
  day = {22},
  publicationstatus = {published}
}
@article{phan2020improvingenhancement,
  author = {Phan, H and McLoughlin, IV and Pham, L and Chen, OY and Koch, P and De Vos, M and Mertins, A},
  journal = {IEEE Signal Processing Letters},
  month = {Sep},
  pages = {1--1},
  publisher = {Institute of Electrical and Electronics Engineers},
  title = {Improving GANs for Speech Enhancement},
  year = {2020},
  doi = {10.1109/lsp.2020.3025020},
  issn = {1070-9908},
  eissn = {1558-2361},
  day = {21},
  publicationstatus = {published}
}
@article{phan2020towardslearning,
  author = {Phan, H and Chen, OY and Koch, P and Lu, Z and McLoughlin, I and Mertins, A and De Vos, M},
  journal = {IEEE Transactions on Biomedical Engineering},
  month = {Aug},
  publisher = {Institute of Electrical and Electronics Engineers},
  title = {Towards More Accurate Automatic Sleep Staging via Deep Transfer Learning.},
  url = {https://www.ncbi.nlm.nih.gov/pubmed/32866092},
  volume = {PP},
  year = {2020},
  abstract = {BACKGROUND: Despite recent significant progress in the development of automatic sleep staging methods, building a good model still remains a big challenge for sleep studies with a small cohort due to the data-variability and data-inefficiency issues. This work presents a deep transfer learning approach to overcome these issues and enable transferring knowledge from a large dataset to a small cohort for automatic sleep staging. METHODS: We start from a generic end-to-end deep learning framework for sequence-to-sequence sleep staging and derive two networks as the means for transfer learning. The networks are first trained in the source domain (i.e. the large database). The pretrained networks are then finetuned in the target domain (i.e. the small cohort) to complete knowledge transfer. We employ the Montreal Archive of Sleep Studies (MASS) database consisting of 200 subjects as the source domain and study deep transfer learning on three different target domains: the Sleep Cassette subset and the Sleep Telemetry subset of the Sleep-EDF Expanded database, and the Surrey-cEEGrid database. The target domains are purposely adopted to cover different degrees of data mismatch to the source domains. RESULTS: Our experimental results show significant performance improvement on automatic sleep staging on the target domains achieved with the proposed deep transfer learning approach. CONCLUSIONS: These results suggest the efficacy of the proposed approach in addressing the above-mentioned data-variability and data-inefficiency issues. SIGNIFICANCE: As a consequence, it would enable one to improve the quality of automatic sleep staging models when the amount of data is relatively small.},
  doi = {10.1109/TBME.2020.3020381},
  issn = {0018-9294},
  eissn = {1558-2531},
  language = {eng},
  day = {31},
  publicationstatus = {published}
}
@article{phan2020personalizedregularization,
  author = {Phan, H and Mikkelsen, K and Chén, OY and Koch, P and Mertins, A and Kidmose, P and De Vos, M},
  journal = {Physiological Measurement},
  month = {Jun},
  publisher = {IOP Publishing},
  title = {Personalized automatic sleep staging with single-night data: a pilot study with KL-divergence regularization},
  year = {2020},
  doi = {10.1088/1361-6579/ab921e},
  issn = {0967-3334},
  eissn = {1361-6579},
  day = {30},
  publicationstatus = {published}
}
@article{stockman2020exploringpriming,
  author = {Stockman, A and Feng, F},
  journal = {Journal on Multimodal User Interfaces},
  month = {Jul},
  publisher = {Springer Verlag},
  title = {Exploring crossmodal perceptual enhancement and integration in a sequence reproducing task with cognitive priming},
  year = {2020},
  doi = {10.1007/s12193-020-00326-y},
  issn = {1783-7677},
  day = {13},
  publicationstatus = {published}
}
@article{stockman2020theobjects,
  author = {Stockman, A and Wilkie, S},
  journal = {Applied Acoustics},
  month = {May},
  publisher = {Elsevier},
  title = {The effect of audio cues and sound source stimuli on the perception of approaching objects},
  year = {2020},
  doi = {10.1016/j.apacoust.2020.107388},
  issn = {0003-682X},
  day = {18},
  publicationstatus = {published}
}
@inproceedings{fields2020prefacepreface,
  author = {Fields, B and Stockman, T and Nickerson, LV and Healey, PGT},
  booktitle = {Proceedings of the 20th BCS HCI Group Conference: Engage, HCI 2006},
  month = {Jan},
  pages = {i},
  title = {Preface},
  year = {2020},
  day = {1},
  publicationstatus = {published}
}
@article{stowell2020auk,
  author = {Stowell, D and Kelly, J and Tanner, D and Taylor, J and Jones, E and Geddes, J and Chalstrey, E},
  journal = {Scientific Data},
  month = {Dec},
  number = {1},
  title = {A harmonised, high-coverage, open dataset of solar photovoltaic installations in the UK},
  volume = {7},
  year = {2020},
  abstract = {Solar photovoltaic (PV) is an increasingly significant fraction of electricity generation. Efficient management, and innovations such as short-term forecasting and machine vision, demand high-resolution geographic datasets of PV installations. However, official and public sources have notable deficiencies: spatial imprecision, gaps in coverage and lack of crucial meta data, especially for small-scale solar panel installations. We present the results of a major crowd-sourcing campaign to create open geographic data for over 260,000 solar PV installations across the UK, covering an estimated 86\% of the capacity in the country. We focus in particular on capturing small-scale domestic solar PV, which accounts for a significant fraction of generation but was until now very poorly documented. Our dataset suggests nameplate capacities in the UK (as of September 2020) amount to a total of 10.66 GW explicitly mapped, or 13.93 GW when missing capacities are inferred. Our method is applied to the UK but applicable worldwide, and compatible with continual updating to track the rapid growth in PV deployment.},
  doi = {10.1038/s41597-020-00739-0},
  eissn = {2052-4463},
  day = {1},
  publicationstatus = {accepted}
}
@article{stowell2020ecoacousticsscale,
  author = {Stowell, D and Sueur, J},
  journal = {Remote Sensing in Ecology and Conservation},
  month = {Aug},
  number = {3},
  pages = {217--219},
  title = {Ecoacoustics: acoustic sensing for biodiversity monitoring at scale},
  volume = {6},
  year = {2020},
  doi = {10.1002/rse2.174},
  eissn = {2056-3485},
  day = {3},
  publicationstatus = {published}
}
@inproceedings{solomes2020efficientsystem,
  author = {Solomes, AM and Stowell, D},
  booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  month = {May},
  pages = {746--750},
  title = {Efficient Bird Sound Detection on the Bela Embedded System},
  volume = {2020-May},
  year = {2020},
  abstract = {Monitoring wildlife is an important aspect of conservation initiatives. Deep learning detectors can help with this, although it is not yet clear whether they can run efficiently on an embedded system in the wild. This paper proposes an automatic detection algorithm for the Bela embedded Linux device for wildlife monitoring. The algorithm achieves good quality recognition, efficiently running on continuously streamed data on a commercially available platform. The program is capable of computing on-board detection using convolutional neural networks (CNNs) with an AUC score of 82.5\% on the testing set of an international data challenge. This paper details how the model is exported to work on the Bela Mini in C++, with the spectrogram generation and the implementation of the feed-forward network, and evaluates its performance on the Bird Audio Detection challenge 2018 DCASE data.},
  doi = {10.1109/ICASSP40776.2020.9053533},
  isbn = {9781509066315},
  issn = {1520-6149},
  day = {1},
  publicationstatus = {published}
}
@article{fanoyela2020onlinetree,
  author = {Fano Yela, D and Thalmann, F and Nicosia, V and Stowell, D and Sandler, M},
  journal = {Physical Review Research},
  month = {Apr},
  number = {2},
  publisher = {American Physical Society (APS)},
  title = {Online visibility graphs: Encoding visibility in a binary search tree},
  volume = {2},
  year = {2020},
  doi = {10.1103/physrevresearch.2.023069},
  eissn = {2643-1564},
  language = {en},
  day = {23},
  publicationstatus = {published}
}
@article{ycart2020learninglstms,
  author = {Ycart, A and Benetos, E},
  journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
  month = {Dec},
  number = {1},
  pages = {1328--1341},
  publisher = {Institute of Electrical and Electronics Engineers},
  title = {Learning and Evaluation Methodologies for Polyphonic Music Sequence Prediction with LSTMs},
  volume = {28},
  year = {2020},
  abstract = {Music language models (MLMs) play an important role for various music signal and symbolic music processing tasks, such as music generation, symbolic music classification, or automatic music transcription (AMT). In this paper, we investigate Long Short-Term Memory (LSTM) networks for polyphonic music prediction, in the form of binary piano rolls. A preliminary experiment, assessing the influence of the timestep of piano rolls on system performance, highlights the need for more musical evaluation metrics. We introduce a range of metrics, focusing on temporal and harmonic aspects. We propose to combine them into a parametrisable loss to train our network. We then conduct a range of experiments with this new loss, both for polyphonic music prediction (intrinsic evaluation) and using our predictive model as a language model for AMT (extrinsic evaluation). Intrinsic evaluation shows that tuning the behaviour of a model is possible by adjusting loss parameters, with consistent results across timesteps. Extrinsic evaluation shows consistent behaviour across timesteps in terms of precision and recall with respect to the loss parameters, leading to an improvement in AMT performance without changing the complexity of the model. In particular, we show that intrinsic performance (in terms of cross entropy) is not related to extrinsic performance, highlighting the importance of using custom training losses for each specific application. Our model also compares favourably with previously proposed MLMs.},
  doi = {10.1109/TASLP.2020.2987130},
  issn = {2329-9304},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{chettri2020subbandverification,
  author = {Chettri, B and Kinnunen, T and Benetos, E},
  booktitle = {Proceedings of Odyssey 2020: The Speaker and Language Recognition Workshop},
  url = {http://www.odyssey2020.org/},
  month = {Nov},
  organization = {Tokyo, Japan},
  pages = {341--348},
  publisher = {ISCA},
  title = {Subband modeling for spoofing detection in automatic speaker verification},
  year = {2020},
  abstract = {Spectrograms - time-frequency representations of audio signals - have found widespread use in neural network-based spoofing detection. While deep models are trained on the fullband spectrum of the signal, we argue that not all frequency bands are useful for these tasks. In this paper, we systematically investigate the impact of different subbands and their importance on replay spoofing detection on two benchmark datasets: ASVspoof 2017 v2.0 and ASVspoof 2019 PA. We propose a joint subband modelling framework that employs n different sub-networks to learn subband specific features. These are later combined and passed to a classifier and the whole network weights are updated during training. Our findings on the ASVspoof 2017 dataset suggest that the most discriminative information appears to be in the first and the last 1 kHz frequency bands, and the joint model trained on these two subbands shows the best performance outperforming the baselines by a large margin. However, these findings do not generalise on the ASVspoof 2019 PA dataset. This suggests that the datasets available for training these models do not reflect real world replay conditions suggesting a need for careful design of datasets for training replay spoofing countermeasures.},
  doi = {10.21437/Odyssey.2020-48},
  startyear = {2020},
  startmonth = {Nov},
  startday = {1},
  finishyear = {2020},
  finishmonth = {Nov},
  finishday = {5},
  conference = {Odyssey 2020: The Speaker and Language Recognition Workshop},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{pankajakshan2020memoryrecognition,
  author = {Pankajakshan, A and Bear, H and Subramanian, V and Benetos, E},
  booktitle = {},
  month = {Oct},
  organization = {Shanghai, China},
  publisher = {International Speech Communication Association (ISCA)},
  title = {Memory Controlled Sequential Self Attention for Sound Recognition},
  year = {2020},
  abstract = {In this paper we investigate the importance of the extent of memory in sequential self attention for sound recognition. We propose to use a memory controlled sequential self attention mechanism on top of a convolutional recurrent neural network (CRNN) model for polyphonic sound event detection (SED). Experiments on the URBAN-SED dataset demonstrate the impact of the extent of memory on sound recognition performance with the self attention induced SED model. We extend the proposed idea with a multi-head self attention mechanism where each attention head processes the audio embedding with explicit attention width values. The proposed use of memory controlled sequential self attention offers a way to induce relations among frames of sound event tokens. We show that our memory controlled self attention model achieves an event-based F-score of 33.92\% on the URBAN-SED dataset, outperforming the F-score of 20.10\% reported by the model without self attention.},
  startyear = {2020},
  startmonth = {Oct},
  startday = {25},
  finishyear = {2020},
  finishmonth = {Oct},
  finishday = {29},
  conference = {21st Annual Conference of the International Speech Communication Association (INTERSPEECH 2020)},
  day = {25},
  publicationstatus = {accepted}
}
@inproceedings{ragano2020developmentconditions,
  author = {Ragano, A and Benetos, E and Hines, A},
  booktitle = {},
  month = {Oct},
  organization = {Shanghai, China},
  title = {Development of a Speech Quality Database Under Uncontrolled Conditions},
  year = {2020},
  abstract = {Objective audio quality assessment is preferred to avoid time-consuming and costly listening tests. The development of objective quality metrics depends on the availability of datasets appropriate to the application under study. Currently, a suitable human-annotated dataset for developing quality metrics in archive audio is missing. Given the online availability of archival recordings, we propose to develop a real-world audio quality dataset. We present a methodology used to curate a speech quality database using the archive recordings from the Apollo Space Program. The proposed procedure is based on two steps: a pilot listening test and an exploratory data analysis. The pilot listening test shows that we can extract audio clips through the control of speech-to-text performance metrics to prevent data repetition. Through unsupervised exploratory data analysis, we explore the characteristics of the degradations. We classify distinct degradations and we study spectral, intensity, tonality and overall quality properties of the data through clustering techniques. These results provide the necessary foundation to support the subsequent development of large-scale crowdsourced datasets for audio quality.},
  startyear = {2020},
  startmonth = {Oct},
  startday = {25},
  finishyear = {2020},
  finishmonth = {Oct},
  finishday = {29},
  conference = {21st Annual Conference of the International Speech Communication Association (INTERSPEECH 2020)},
  day = {25},
  publicationstatus = {accepted}
}
@article{chettri2020deepverification,
  author = {Chettri, B and Kinnunen, T and Benetos, E},
  journal = {Computer Speech and Language},
  month = {Sep},
  number = {101092},
  publisher = {Elsevier},
  title = {Deep Generative Variational Autoencoding for Replay Spoof Detection in Automatic Speaker Verification},
  volume = {63},
  year = {2020},
  abstract = {Automatic speaker verification (ASV) systems are highly vulnerable to presentation attacks, also called spoofing attacks. Replay is among the simplest attacks to mount - yet difficult to detect reliably. The generalization failure of spoofing countermeasures (CMs) has driven the community to study various alternative deep learning CMs. The majority of them are supervised approaches that learn a human-spoof discriminator. In this paper, we advocate a different, deep generative approach that leverages from powerful unsupervised manifold learning in classification. The potential benefits include the possibility to sample new data, and to obtain insights to the latent features of genuine and spoofed speech. To this end, we propose to use variational autoencoders (VAEs) as an alternative backend for replay attack detection, via three alternative models that differ in their class-conditioning. The first one, similar to the use of Gaussian mixture models (GMMs) in spoof detection, is to train independently two VAEs - one for each class. The second one is to train a single conditional model (C-VAE) by injecting a one-hot class label vector to the encoder and decoder networks. Our final proposal integrates an auxiliary classifier to guide the learning of the latent space. Our experimental results using constant-Q cepstral coefficient (CQCC) features on the ASVspoof 2017 and 2019 physical access subtask datasets indicate that the C-VAE offers substantial improvement in comparison to training two separate VAEs for each class. On the 2019 dataset, the C-VAE outperforms the VAE and the baseline GMM by an absolute 9-10\% in both equal error rate (EER) and tandem detection cost function (t-DCF) metrics. Finally, we propose VAE residuals --- the absolute difference of the original input and the reconstruction as features for spoofing detection. The proposed frontend approach augmented with a convolutional neural network classifier demonstrated substantial improvement over the VAE backend use case.},
  doi = {10.1016/j.csl.2020.101092},
  issn = {0885-2308},
  day = {1},
  publicationstatus = {accepted}
}
@inproceedings{mishra2020reliablelistening,
  author = {Mishra, S and Benetos, E and Sturm, B and Dixon, S},
  booktitle = {},
  month = {Jul},
  organization = {Glasgow, UK},
  publisher = {IEEE},
  title = {Reliable Local Explanations for Machine Listening},
  url = {https://wcci2020.org/},
  year = {2020},
  abstract = {One way to analyse the behaviour of machine learning models is through local explanations that highlight input features that maximally influence model predictions. Sensitivity analysis, which involves analysing the effect of input perturbations on model predictions, is one of the methods to generate local explanations. Meaningful input perturbations are essential for generating reliable explanations, but there exists limited work on what such perturbations are and how to perform them. This work investigates these questions in the context of machine listening models that analyse audio. Specifically, we use a state-of-the-art deep singing voice detection (SVD) model to analyse whether explanations from SoundLIME (a local explanation method) are sensitive to how the method perturbs model inputs. The results demonstrate that SoundLIME explanations are sensitive to the content in the occluded input regions. We further propose and demonstrate a novel method for quantitatively identifying suitable content type(s) for reliably occluding inputs of machine listening models. The results for the SVD model suggest that the average magnitude of input mel-spectrogram bins is the most suitable content type for temporal explanations.},
  startyear = {2020},
  startmonth = {Jul},
  startday = {19},
  finishyear = {2020},
  finishmonth = {Jul},
  finishday = {24},
  conference = {International Joint Conference on Neural Networks (IJCNN)},
  day = {19},
  publicationstatus = {accepted}
}
@inproceedings{ragano2020audiorepresentation,
  author = {Ragano, A and Benetos, E and Hines, A},
  booktitle = {Proceedings of the 12th International Conference on Quality of Multimedia Experience (QoMEX)},
  url = {http://qomex2020.ie/},
  month = {May},
  organization = {Athlone, Ireland},
  publisher = {IEEE},
  title = {Audio impairment recognition using a correlation-based feature representation},
  year = {2020},
  abstract = {Audio impairment recognition is based on finding noise in audio files and categorising the impairment type. Recently, significant performance improvement has been obtained thanks to the usage of advanced deep learning models. However, feature robustness is still an unresolved issue and it is one of the main reasons why we need powerful deep learning architectures. In the presence of a variety of musical styles, hand-crafted features are less efficient in capturing audio degradation characteristics and they are prone to failure when recognising audio impairments and could mistakenly learn musical concepts rather than impairment types. In this paper, we propose a new representation of hand-crafted features that is based on the correlation of feature pairs. We experimentally compare the proposed correlation-based feature representation with a typical raw feature representation used in machine learning and we show superior performance in terms of compact feature dimensionality and improved computational speed in the test stage whilst achieving comparable accuracy.},
  startyear = {2020},
  startmonth = {May},
  startday = {26},
  finishyear = {2020},
  finishmonth = {May},
  finishday = {28},
  conference = {12th International Conference on Quality of Multimedia Experience (QoMEX)},
  day = {26},
  publicationstatus = {accepted}
}
@inproceedings{wang2020playingscattering,
  author = {Wang, C and Lostanlen, V and Benetos, E and Chew, E},
  booktitle = {},
  month = {May},
  organization = {Barcelona, Spain},
  pages = {881--885},
  title = {Playing Technique Recognition by Joint Time–Frequency Scattering},
  year = {2020},
  abstract = {Playing techniques are important expressive elements in music signals. In this paper, we propose a recognition system based on the joint time–frequency scattering transform (jTFST) for pitch evolution-based playing techniques (PETs), a group of playing techniques with monotonic pitch changes over time. The jTFST represents spectro-temporal patterns in the time–frequency domain, capturing discriminative information of PETs. As a case study, we analyse three commonly used PETs of the Chinese bamboo flute: acciacatura, portamento, and glissando, and encode their characteristics using the jTFST. To verify the proposed approach, we create a new dataset, the CBF-petsDB, containing PETs played in isolation as well as in the context of whole pieces performed and annotated by professional players. Feeding the jTFST to a machine learning classifier, we obtain F-measures of 71\% for acciacatura, 59\% for portamento, and 83\% for glissando detection, and provide explanatory visualisations of scattering coefficients for each technique.},
  doi = {10.1109/ICASSP40776.2020.9053474},
  startyear = {2020},
  startmonth = {May},
  startday = {4},
  finishyear = {2020},
  finishmonth = {May},
  finishday = {8},
  keyword = {Music signal analysis},
  keyword = {Scattering transform},
  keyword = {Performance analysis},
  keyword = {Playing technique recognition},
  conference = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2020)},
  day = {4},
  publicationstatus = {published}
}
@inproceedings{wei2020acrnndetection,
  author = {Wei, W and Zhu, H and Benetos, E and Wang, Y},
  booktitle = {},
  month = {May},
  organization = {Barcelona, Spain},
  pages = {276--280},
  publisher = {IEEE},
  title = {A-CRNN: a domain adaptation model for sound event detection},
  url = {https://2020.ieeeicassp.org/},
  year = {2020},
  abstract = {This paper presents a domain adaptation model for sound event detection. A common challenge for sound event detection is how to deal with the mismatch among different datasets. Typically, the performance of a model will decrease if it is tested on a dataset which is different from the one that the model is trained on. To address this problem, based on convolutional recurrent neural networks (CRNNs), we propose an adapted CRNN (A-CRNN) as an unsupervised adversarial domain adaptation model for sound event detection. We have collected and annotated a dataset in Singapore with two types of recording devices to complement existing datasets in the research community, especially with respect to domain adaptation. We perform experiments on recordings from different datasets and from different recordings devices. Our experimental results show that the proposed A-CRNN model can achieve a better performance on an unseen dataset in comparison with the baseline non-adapted CRNN model.},
  doi = {10.1109/ICASSP40776.2020.9054248},
  startyear = {2020},
  startmonth = {May},
  startday = {4},
  finishyear = {2020},
  finishmonth = {May},
  finishday = {8},
  conference = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2020)},
  day = {4},
  publicationstatus = {published}
}
@inproceedings{subramanian2020aclassification,
  author = {Subramanian, V and Pankajakshan, A and Benetos, E and Xu, N and McDonald, S and Sandler, M},
  booktitle = {},
  month = {May},
  organization = {Barcelona, Spain},
  pages = {301--305},
  publisher = {IEEE},
  title = {A Study on the Transferability of Adversarial Attacks in Sound Event Classification},
  url = {https://2020.ieeeicassp.org/},
  year = {2020},
  abstract = {An adversarial attack is an algorithm that perturbs the input of a machine learning model in an intelligent way in order to change the output of the model. An important property of adversarial attacks is transferability. According to this property, it is possible to generate adversarial perturbations on one model and apply them to the input of a different model to fool its output. Our work focuses on studying the transferability of adversarial attacks in sound event classification. We are able to demonstrate differences in transferability properties from those observed in computer vision. We show that dataset normalization techniques such as z-score normalization do not affect the transferability of adversarial attacks, and that techniques such as knowledge distillation do not increase the transferability of attacks.},
  doi = {10.1109/ICASSP40776.2020.9054445},
  startyear = {2020},
  startmonth = {May},
  startday = {4},
  finishyear = {2020},
  finishmonth = {May},
  finishday = {8},
  conference = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2020)},
  day = {4},
  publicationstatus = {published}
}
@inproceedings{kim2020workinggames,
  author = {Kim, R and Thomas, S and Van Dierendonck, R and Bryan-Kinns, N and Poslad, S},
  booktitle = {ACM International Conference Proceeding Series},
  month = {Sep},
  title = {Working with Nature's Lag: Initial Design Lessons for Slow Biotic Games},
  year = {2020},
  abstract = {One of the most fundamental features of living organisms is their growth, a biological phenomenon that can be considered as a type of slow, tangible output responding to an environmental stimulus or an input. Given the relative slowness of growth, once it becomes part of game mechanics, the feature can lead to slow interactivity and slow gameplay in biotic games - a relatively new type of bio-digital game that enables playful human-microbe interactions. Currently, there is a lack of annotations on existing biotic game design guidelines that 1) recognise biological slowness as a potentially beneficial feature in game design, and 2) provide specific advice on how an organism's slow response time can be effectively incorporated in biotic games. To start addressing these limitations, we report on an initial set of design lessons learnt from our research on slow biotic games. Through these lessons, we have formulated and outlined a set of practical recommendations for prospective designers of slow biotic games.},
  doi = {10.1145/3402942.3409790},
  isbn = {9781450388078},
  day = {15},
  publicationstatus = {published}
}
@inproceedings{soave2020areality,
  author = {Soave, F and Bryan-Kinns, N and Farkhatdinov, I},
  booktitle = {},
  month = {Aug},
  title = {A Preliminary Study on Full-Body Haptic Stimulation on Modulating Self-Motion Perception in Virtual Reality},
  year = {2020},
  conference = {Salento AVR 2020},
  day = {31},
  publicationstatus = {published}
}
@inproceedings{Kim2020,
  author = {Kim, R and Thomas, S and Dierendonck, RV and Bryan-Kinns, N and Poslad, S},
  title = {Working with Nature's Lag: Initial Design Lessons for Slow Biotic Games.},
  booktitle = {FDG},
  year = {2020},
  editor = {Yannakakis, GN and Liapis, A and Kyburz, P and Volz, V and Khosmood, F and Lopes, P},
  pages = {29:1--29:1},
  publisher = {ACM},
  isbn = {978-1-4503-8807-8},
  url = {https://doi.org/10.1145/3402942}
}
@incollection{edlin2020exploringmaterial,
  author = {Edlin, L and Liu, Y and Bryan-Kinns, N and Reiss, J},
  booktitle = {},
  month = {Jan},
  pages = {54--69},
  title = {Exploring Augmented Reality as Craft Material},
  volume = {12428 LNCS},
  year = {2020},
  abstract = {Craft making is associated with tradition, cultural preservation, and skilled hand-making techniques. While there are examples of digital craft making analyses in the literature, Augmented Reality (AR) applied to craft making practice has not been explored, yet applying AR to craft making practices could bring insight into methods of combining virtual and physical materials. This paper investigates how AR is considered by craft makers. We find that narrative is essentially physically located in craft objects, and while virtual elements may describe and annotate an artefact, they are not considered part of the craft artefact’s narrative.},
  doi = {10.1007/978-3-030-59990-4_5},
  isbn = {9783030599898},
  issn = {0302-9743},
  eissn = {1611-3349},
  day = {1},
  publicationstatus = {published}
}
@article{wang2020oncultures,
  author = {Wang, W and Bryan-Kinns, N and Sheridan, JG},
  journal = {CoDesign},
  month = {Jul},
  number = {3},
  pages = {233--250},
  title = {On the role of in-situ making and evaluation in designing across cultures},
  volume = {16},
  year = {2020},
  abstract = {There is a growing interest in designing products and interactions across cultures. In this paper, we report on our attempts to use in-situ making and evaluation to facilitate a short co-design process with outside designers in an ethnic and rural community. We found that rapid prototyping in the local context provided a mechanism to quickly engage designers with locals in informing iterative design refinement. Our research suggests that using in-situ making interlaced with evaluation is a feasible approach to drive designers to immerse, exchange and design within a culturally different context in the early stage of design exploration. We found that the rapid nature of our process makes it more suited for cultural product design led by designers than cross-cultural design.},
  doi = {10.1080/15710882.2019.1580296},
  issn = {1571-0882},
  eissn = {1745-3755},
  day = {2},
  publicationstatus = {published}
}
@inproceedings{proutskova2020fromethnomusicontology,
  author = {Proutskova, P and Volk, A and Heidarian, P and Fazekas, G},
  booktitle = {Proceedings of the 21st International Society for Music Information Retrieval Conference (ISMIR 2020)},
  month = {Oct},
  organization = {Montreal, Canada},
  pages = {923--931},
  title = {From Music Ontology Towards Ethno-Music-Ontology},
  url = {https://program.ismir2020.net/static/final_papers/323.pdf},
  year = {2020},
  abstract = {This paper presents exploratory work investigating the suitability of the Music Ontology - the most widely used formal specification of the music domain - for modelling non-Western musical traditions. Four contrasting case studies from a variety of musical cultures are analysed: Dutch folk song research, reconstructive performance of rural Russian traditions, contemporary performance and composition of Persian classical music, and recreational use of a personal world music collection. We propose semantic models describing the respective domains and examine the applications of the Music Ontology for these case studies: which concepts can be successfully reused, where they need adjustments, and which parts of the reality in these case studies are not covered by the Music Ontology. The variety of traditions, contexts and modelling goals covered by our case studies sheds light on the generality of the Music Ontology and on the limits of generalisation “for all musics” that could be aspired for on the Semantic Web.},
  startyear = {2020},
  startmonth = {Oct},
  startday = {11},
  finishyear = {2020},
  finishmonth = {Oct},
  finishday = {16},
  isbn = {978-0-9813537-0-8},
  keyword = {Music Information Retrieval},
  keyword = {Music Ontology},
  keyword = {Ethno Musicology},
  conference = {International Society for Music Information Retrieval Conference},
  day = {11},
  publicationstatus = {published}
}
@article{williams2020onmusic,
  author = {Williams, D and Fazenda, B and Williamson, V and Fazekas, G},
  journal = {Sensors (Basel, Switzerland)},
  month = {Aug},
  number = {16},
  pages = {1--14},
  publisher = {MDPI AG},
  title = {On performance and perceived effort in trail runners using sensor control to generate biosynchronous music},
  volume = {20},
  year = {2020},
  abstract = {Music has been shown to be capable of improving runners’ performance in treadmill and laboratory-based experiments. This paper evaluates a generative music system, namely HEARTBEATS, designed to create biosignal synchronous music in real-time according to an individual athlete’s heartrate or cadence (steps per minute). The tempo, melody, and timbral features of the generated music are modulated according to biosensor input from each runner using a combination of PPG (Photoplethysmography) and GPS (Global Positioning System) from a wearable sensor, synchronized via Bluetooth. We compare the relative performance of athletes listening to music with heartrate and cadence synchronous tempos, across a randomized trial (N = 54) on a trail course with 76 ft of elevation. Participants were instructed to continue until their self-reported perceived effort went beyond an 18 using the Borg rating of perceived exertion. We found that cadence-synchronous music improved performance and decreased perceived effort in male runners. For female runners, cadence synchronous music improved performance but it was heartrate synchronous music which significantly reduced perceived effort and allowed them to run the longest of all groups tested. This work has implications for the future design and implementation of novel portable music systems and in music-assisted coaching.},
  doi = {10.3390/s20164528},
  issn = {1424-8220},
  day = {13},
  publicationstatus = {published}
}
@inproceedings{thompson2020posterdevelopers,
  author = {Thompson, A and Fazekas, G and Wiggins, G},
  booktitle = {Proceedings of IEEE Symposium on Visual Languages and Human-Centric Computing, VL/HCC},
  month = {Aug},
  title = {Poster: Programming Practices among Interactive Audio Software Developers},
  volume = {2020-August},
  year = {2020},
  abstract = {New domain-specific languages for creating music and audio applications have typically been created in response to some technological challenge. Recent research has begun looking at how these languages impact our creative and aesthetic choices in music-making, but we have little understanding of their effect on our wider programming practice. We present a survey that seeks to uncover what programming practices exist among interactive audio software developers and discover that development is highly multi-practice, with developers adopting both exploratory programming and software engineering practices. A Q methodological study reveals that this multi-practice development is supported by different combinations of language features.},
  doi = {10.1109/VL/HCC50065.2020.9127261},
  isbn = {9781728169019},
  issn = {1943-6092},
  eissn = {1943-6106},
  day = {1},
  publicationstatus = {published}
}
@article{turchet2020cloudsmartinteractions,
  author = {Turchet, L and Pauwels, J and Fischione, C and Fazekas, G},
  journal = {ACM Transactions on Internet of Things},
  month = {Jul},
  number = {3},
  pages = {1--29},
  title = {Cloud-smart Musical Instrument Interactions},
  volume = {1},
  year = {2020},
  doi = {10.1145/3377881},
  issn = {2577-6207},
  day = {14},
  publicationstatus = {published}
}
@article{turchet2020thechallenges,
  author = {Turchet, L and Fazekas, G and Lagrange, M and Ghadikolaei, HS and Fischione, C},
  journal = {IEEE Internet of Things Journal},
  month = {May},
  number = {10},
  pages = {10233--10249},
  title = {The Internet of Audio Things: State of the Art, Vision, and Challenges},
  volume = {7},
  year = {2020},
  doi = {10.1109/jiot.2020.2997047},
  issn = {2327-4662},
  day = {25},
  publicationstatus = {published}
}
@inproceedings{shatri2020opticalchallenges,
  author = {Shatri, E and Fazekas, G},
  booktitle = {},
  month = {May},
  organization = {Hamburg},
  title = {Optical Music Recognition: State of the Art and Major Challenges},
  url = {https://www.elonashatri.co.uk/},
  year = {2020},
  abstract = {Optical Music Recognition (OMR) is concerned with transcribing sheet music into a machine-readable format. The transcribed copy should allow musicians to compose, play and edit music by taking a picture of a music sheet. Complete transcription of sheet music would also enable more efficient archival. OMR facilitates examining sheet music statistically or searching for patterns of notations, thus helping use cases in digital musicology too. Recently, there has been a shift in OMR from using conventional computer vision techniques towards a deep learning approach. In this paper, we review relevant works in OMR, including fundamental methods and significant outcomes, and highlight different stages of the OMR pipeline. These stages often lack standard input and output representation and standardised evaluation. Therefore, comparing different approaches and evaluating the impact of different processing methods can become rather complex. This paper provides recommendations for future work, addressing some of the highlighted issues and represents a position in furthering this important field of research.},
  startyear = {2020},
  startmonth = {May},
  startday = {12},
  finishyear = {2020},
  finishmonth = {May},
  finishday = {14},
  conference = {International Conference on Technologies for Music Notation and Representation},
  day = {12},
  publicationstatus = {accepted}
}
@article{turchet2020theontology,
  author = {Turchet, L and Antoniazzi, F and Viola, F and Giunchiglia, F and Fazekas, G},
  journal = {Journal of Web Semantics},
  month = {Jan},
  title = {The Internet of Musical Things Ontology},
  volume = {60},
  year = {2020},
  abstract = {The Internet of Musical Things (IoMusT) is an emerging research area consisting of the extension of the Internet of Things paradigm to the music domain. Interoperability represents a central issue within this domain, where heterogeneous objects dedicated to the production and/or reception of musical content (Musical Things) are envisioned to communicate between each other. This paper proposes an ontology for the representation of the knowledge related to IoMusT ecosystems to facilitate interoperability between Musical Things. There was no previous comprehensive data model for the IoMusT domain; however, the new ontology relates to existing ontologies, including the SOSA Ontology for the representation of sensors and actuators and the Music Ontology focusing on the production and consumption of music. This paper documents the design of the ontology and its evaluation with respect to specific requirements gathered from an extensive literature review, which was based on scenarios involving IoMusT stakeholders, such as performers and audience members. The IoMusT Ontology can be accessed at: https://w3id.org/iomust#.},
  doi = {10.1016/j.websem.2020.100548},
  issn = {1570-8268},
  day = {22},
  publicationstatus = {published}
}
@inproceedings{light2020designingnotquiteyet,
  author = {Light, A and Healey, PGT and Simpson, G},
  booktitle = {Proceedings of the 20th BCS HCI Group Conference: Engage, HCI 2006},
  month = {Jan},
  pages = {282--283},
  title = {Designing the not-quite-yet},
  year = {2020},
  abstract = {As digital technologies become more complex and further penetrate domestic, educational, social and political areas of life, we ask how people are to make sense of the choices available to them. Ideally, everyone should participate in the design decisions that will impact on their lives, based on a good understanding of the potential of digital networks and the implications of using them. But the benefits and shortcomings of technologies are notoriously difficult to anticipate before implementation, just as their uses are. This workshop is for those interested in empowering the public to contribute to design effectively. We will gather and share ideas and success stories in the company of artists working in this field. The goal is a wider franchise for design… and better outcomes.},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{moro2020ageneration,
  author = {Moro, G and McPherson, A},
  booktitle = {Proceedings of the International Conference on New Interfaces for Musical Expression},
  month = {Jul},
  organization = {Royal Birmingham Conservatoire},
  title = {A platform for low-latency continuous keyboard sensing and sound generation},
  year = {2020},
  abstract = {On several acoustic and electromechanical keyboard instruments, the produced sound is not always strictly dependent exclusively on a discrete key velocity parameter, and minute gesture details can affect the final sonic result. By contrast, subtle variations in articulation have a relatively limited effect on the sound generation when the keyboard controller uses the MIDI standard, used in the vast majority of digital keyboards. In this paper we present an embedded platform that can generate sound in response to a controller capable of sensing the continuous position of keys on a keyboard. This platform enables the creation of keyboard-based DMIs which allow for a richer set of interaction gestures than would be possible through a MIDI keyboard, which we demonstrate through two example instruments. First, in a Hammond organ emulator, the sensing device allows the nuances of the interaction with the original instrument to be recreated in a way that a velocity-based MIDI controller could not. Second, a nonlinear waveguide flute synthesizer is shown as an example of the expressive capabilities that a continuous-keyboard controller opens up in the creation of new keyboard-based DMIs.},
  startyear = {2020},
  startmonth = {Jul},
  startday = {21},
  finishyear = {2020},
  finishmonth = {Jul},
  finishday = {25},
  issn = {2220-4806},
  conference = {International Conference on New Interfaces for Musical Expression},
  day = {20},
  publicationstatus = {accepted}
}
@inproceedings{lepri2020uselesspractice,
  author = {Lepri, G and McPherson, A and Bowers, J},
  booktitle = {DIS 2020 - Proceedings of the 2020 ACM Designing Interactive Systems Conference},
  month = {Jul},
  pages = {1887--1899},
  title = {Useless, not Worthless: Absurd making as critical practice},
  year = {2020},
  abstract = {We report on the outcomes of a hackathon organised around the themes of absurd musical interfaces, questionable sonic interactions and unworkable music designs. At the core of the project is the intention to explore absurd making as a way to support critical and disruptive design practices. We reflect on how surreal, nonsensical and fragile artefacts can be helpful to stretch and critique conventional ideas of what is useful and appropriate in technology research and development. After introducing both concepts and methods that shaped the event we present a selection of useless interfaces designed by the hackathon's attendees. These musical artefacts, and the considerations around them, are then discussed as a viable means for communicating both design concerns and future visions. We also consider two features identified as playing a crucial role within the event: the discovery of contradictions and the importance of context-based ingredients.},
  doi = {10.1145/3357236.3395547},
  isbn = {9781450369749},
  day = {3},
  publicationstatus = {published}
}
@inproceedings{mcpherson2020beholdeninstruments,
  author = {McPherson, A and Lepri, G},
  booktitle = {},
  month = {Jun},
  title = {Beholden to Our Tools: Negotiating with Technology while Sketching Digital Instruments},
  year = {2020},
  abstract = {Digital musical instrument design is often presented as an open-ended creative process in which technology is adopted and adapted to serve the musical will of the designer. The real-time music programming languages powering many new instruments often provide access to audio manipulation at a low level, theoretically allowing the creation of any sonic structure from primitive operations. As a result, designers may assume that these seemingly omnipotent tools are pliable vehicles for the expression of musical ideas. We present the outcomes of a compositional game in which sound designers were invited to create simple instruments using common sensors and the Pure Data programming language. We report on the patterns and structures that often emerged during the exercise, arguing that designers respond strongly to suggestions offered by the tools they use. We discuss the idea that current music programming languages may be as culturally loaded as the communities of practice that produce and use them. Instrument making is then best viewed as a protracted negotiation between designer and tools.},
  conference = {New Interfaces For Musical Expression},
  day = {3},
  publicationstatus = {accepted}
}
@article{zioga2020auditorystyle,
  author = {Zioga, I and Harrison, PMC and Pearce, MT and Bhattacharya, J and Luft, CDB},
  journal = {Journal of Cognitive Neuroscience},
  month = {Jan},
  number = {12},
  pages = {2241--2259},
  title = {Auditory but not audiovisual cues lead to higher neural sensitivity to the statistical regularities of an unfamiliar musical style},
  volume = {32},
  year = {2020},
  abstract = {It is still a matter of debate whether visual aids improve learning of music. In a multisession study, we investigated the neural signatures of novel music sequence learning with or without aids (auditory-only: AO, audiovisual: AV). During three training sessions on three separate days, participants (non-musicians) reproduced (note by note on a keyboard) melodic sequences generated by an artificial musical grammar. The AV group (n = 20) had each note color-coded on screen, whereas the AO group (n = 20) had no color indication. We evaluated learning of the statistical regularities of the novel music grammar before and after training by presenting melodies ending on correct or incorrect notes and by asking participants to judge the correctness and surprisal of the final note, while EEG was recorded. We found that participants successfully learned the new grammar. Although the AV group, as compared to the AO group, reproduced longer sequences during training, there was no significant difference in learning between groups. At the neural level, after training, the AO group showed a larger N100 response to low-probability compared with high-probability notes, suggesting an increased neural sensitivity to statistical properties of the grammar; this effect was not observed in the AV group. Our findings indicate that visual aids might improve sequence reproduction while not necessarily promoting better learning, indicating a potential dissociation between sequence reproduction and learning. We suggest that the difficulty induced by auditory-only input during music training might enhance cognitive engagement, thereby improving neural sensitivity to the underlying statistical properties of the learned material.},
  doi = {10.1162/jocn_a_01614},
  issn = {0898-929X},
  eissn = {1530-8898},
  day = {1},
  publicationstatus = {published}
}
@article{ycart2020investigatingtranscription,
  author = {Ycart, A and Liu, L and Benetos, E and Pearce, M},
  journal = {Transactions of the International Society for Music Information Retrieval},
  month = {Jun},
  number = {1},
  pages = {68--81},
  publisher = {Ubiquity Press},
  title = {Investigating the Perceptual Validity of Evaluation Metrics for Automatic Piano Music Transcription},
  volume = {3},
  year = {2020},
  abstract = {Automatic Music Transcription (AMT) is usually evaluated using low-level criteria, typically by counting the numbers of errors, with equal weighting. Yet, some errors (e.g. out-of-key notes) are more salient than others. In this study, we design an online listening test to gather judgements about AMT quality. These judgements take the form of pairwise comparisons of transcriptions of the same music by pairs of different AMT systems. We investigate how these judgements correlate with benchmark metrics, and find that although they match in many cases, agreement drops when comparing pairs with similar scores, or pairs of poor transcriptions. We show that onset-only notewise F-measure is the benchmark metric that correlates best with human judgement, all the more so with higher onset tolerance thresholds. We define a set of features related to various musical attributes, and use them to design a new metric that correlates significantly better with listeners' quality judgements. We examine which musical aspects were important to raters by conducting an ablation study on the defined metric, highlighting the importance of the rhythmic dimension (tempo, meter). We make the collected data entirely available for further study, in particular to evaluate the perceptual relevance of new AMT metrics.},
  doi = {10.5334/tismir.57},
  issn = {2514-3298},
  day = {12},
  publicationstatus = {published}
}
@article{bianco2020longtermhumans,
  author = {Bianco, R and Harrison, PMC and Hu, M and Bolger, C and Picken, S and Pearce, MT and Chait, M},
  journal = {eLife},
  month = {May},
  pages = {1--6},
  title = {Long-term implicit memory for sequential auditory patterns in humans},
  volume = {9},
  year = {2020},
  abstract = {© 2020, eLife Sciences Publications Ltd. All rights reserved. Memory, on multiple timescales, is critical to our ability to discover the structure of our surroundings, and efficiently interact with the environment. We combined behavioural manipulation and modelling to investigate the dynamics of memory formation for rarely reoccurring acoustic patterns. In a series of experiments, participants detected the emergence of regularly repeating patterns within rapid tone-pip sequences. Unbeknownst to them, a few patterns reoccurred every ~3 minutes. All sequences consisted of the same 20 frequencies and were distinguishable only by the order of tone-pips. Despite this, reoccurring patterns were associated with a rapidly growing detection-time advantage over novel patterns. This effect was implicit, robust to interference, and persisted up to 7 weeks. The results implicate an interplay between short (a few seconds) and long-term (over many minutes) integration in memory formation and demonstrate the remarkable sensitivity of the human auditory system to sporadically reoccurring structure within the acoustic environment.},
  doi = {10.7554/eLife.56073},
  eissn = {2050-084X},
  day = {1},
  publicationstatus = {published}
}
@article{quirogamartinez2020decomposingsystem,
  author = {Quiroga-Martinez, DR and Hansen, NC and Højlund, A and Pearce, M and Brattico, E and Vuust, P},
  journal = {NeuroImage},
  month = {Apr},
  title = {Decomposing neural responses to melodic surprise in musicians and non-musicians: Evidence for a hierarchy of predictions in the auditory system},
  volume = {215},
  year = {2020},
  abstract = {© 2020 The Author(s) Neural responses to auditory surprise are typically studied with highly unexpected, disruptive sounds. Consequently, little is known about auditory prediction in everyday contexts that are characterized by fine-grained, non-disruptive fluctuations of auditory surprise. To address this issue, we used IDyOM, a computational model of auditory expectation, to obtain continuous surprise estimates for a set of newly composed melodies. Our main goal was to assess whether the neural correlates of non-disruptive surprising sounds in a musical context are affected by musical expertise. Using magnetoencephalography (MEG), auditory responses were recorded from musicians and non-musicians while they listened to the melodies. Consistent with a previous study, the amplitude of the N1m component increased with higher levels of computationally estimated surprise. This effect, however, was not different between the two groups. Further analyses offered an explanation for this finding: Pitch interval size itself, rather than probabilistic prediction, was responsible for the modulation of the N1m, thus pointing to low-level sensory adaptation as the underlying mechanism. In turn, the formation of auditory regularities and proper probabilistic prediction were reflected in later components: The mismatch negativity (MMNm) and the P3am, respectively. Overall, our findings reveal a hierarchy of expectations in the auditory system and highlight the need to properly account for sensory adaptation in research addressing statistical learning.},
  doi = {10.1016/j.neuroimage.2020.116816},
  issn = {1053-8119},
  eissn = {1095-9572},
  day = {8},
  publicationstatus = {accepted}
}
@article{clemente2020aassessments,
  author = {Clemente, A and Vila-Vidal, M and Pearce, MT and Aguiló, G and Corradi, G and Nadal, M},
  journal = {Behavior Research Methods},
  month = {Feb},
  title = {A Set of 200 Musical Stimuli Varying in Balance, Contour, Symmetry, and Complexity: Behavioral and Computational Assessments.},
  url = {https://www.ncbi.nlm.nih.gov/pubmed/32052354},
  year = {2020},
  abstract = {We present a novel set of 200 Western tonal musical stimuli (MUST) to be used in research on perception and appreciation of music. It consists of four subsets of 50 stimuli varying in balance, contour, symmetry, or complexity. All are 4 s long and designed to be musically appealing and experimentally controlled. We assessed them behaviorally and computationally. The behavioral assessment (Study 1) aimed to determine whether musically untrained participants could identify variations in each attribute. Forty-three participants rated the stimuli in each subset on the corresponding attribute. We found that inter-rater reliability was high and that the ratings mirrored the design features well. Participants' ratings also served to create an abridged set of 24 stimuli per subset. The computational assessment (Study 2) required the development of a specific battery of computational measures describing the structural properties of each stimulus. We distilled nonredundant composite measures for each attribute and examined whether they predicted participants' ratings. Our results show that the composite measures indeed predicted participants' ratings. Moreover, the composite complexity measure predicted complexity ratings as well as existing models of musical complexity. We conclude that the four subsets are suitable for use in studies that require presenting participants with short musical motifs varying in balance, contour, symmetry, or complexity, and that the stimuli and the computational measures are valuable resources for research in music psychology, empirical aesthetics, music information retrieval, and musicology. The MUST set and MATLAB toolbox codifying the computational measures are freely available at osf.io/bfxz7.},
  doi = {10.3758/s13428-019-01329-8},
  eissn = {1554-3528},
  keyword = {MIR},
  keyword = {aesthetics},
  keyword = {balance},
  keyword = {complexity},
  keyword = {contour},
  keyword = {music},
  keyword = {symmetry},
  language = {eng},
  pii = {10.3758/s13428-019-01329-8},
  day = {12},
  publicationstatus = {published}
}
@article{harrison2020aleadings,
  author = {Harrison, PMC and Pearce, MT},
  journal = {Music Perception},
  month = {Feb},
  number = {3},
  pages = {208--224},
  title = {A computational cognitive model for the analysis and generation of voice leadings},
  volume = {37},
  year = {2020},
  abstract = {© 2020 by the Regents of the University of California. All rights reserved. Voice leading is a common task in Western music composition whose conventions are consistent with fundamental principles of auditory perception. Here we introduce a computational cognitive model of voice leading, intended both for analyzing voice-leading practices within encoded musical corpora and for generating new voice leadings for unseen chord sequences. This model is feature-based, quantifying the desirability of a given voice leading on the basis of different features derived from Huron's (2001) perceptual account of voice leading. We use the model to analyze a corpus of 370 chorale harmonizations by J. S. Bach, and demonstrate the model's application to the voicing of harmonic progressions in different musical genres. The model is implemented in a new R package, "voicer," which we release alongside this paper.},
  doi = {10.1525/MP.2020.37.3.208},
  issn = {0730-7829},
  eissn = {1533-8312},
  day = {1},
  publicationstatus = {published}
}
@article{quirogamartinez2020musicalnonmusicians,
  author = {Quiroga-Martinez, DR and Hansen, NC and Højlund, A and Pearce, M and Brattico, E and Vuust, P},
  journal = {European Journal of Neuroscience},
  month = {Jan},
  publisher = {Wiley},
  title = {Musical prediction error responses similarly reduced by predictive uncertainty in musicians and non-musicians},
  year = {2020},
  abstract = {Auditory prediction error responses elicited by surprising sounds can be reliably recorded with musical stimuli that are more complex and realistic than those typically employed in EEG or MEG oddball paradigms. However, these responses are reduced as the predictive uncertainty of the stimuli increases. In this study, we investigate whether this effect is modulated by musical expertise. Magnetic mismatch negativity (MMNm) responses were recorded from 26 musicians and 24 non-musicians while they listened to low- and high-uncertainty melodic sequences in a musical multi-feature paradigm that included pitch, slide, intensity, and timbre deviants. When compared to non-musicians, musically trained participants had significantly larger pitch and slide MMNm responses. However, both groups showed comparable reductions of pitch and slide MMNm amplitudes in the high-uncertainty condition compared to the low-uncertainty condition. In a separate, behavioral deviance detection experiment, musicians were more accurate and confident about their responses than non-musicians, but deviance detection in both groups was similarly affected by the uncertainty of the melodies. In both experiments, the interaction between uncertainty and expertise was not significant, suggesting that the effect is comparable in both groups. Consequently, our results replicate the modulatory effect of predictive uncertainty on prediction error; show that it is present across different types of listeners; and suggest that expertise-related and stimulus-driven modulations of predictive precision are dissociable and independent.},
  doi = {10.1101/754333},
  issn = {0953-816X},
  day = {21},
  publicationstatus = {published}
}
@article{gregoromichelaki2020completabilityincompleteness,
  author = {Gregoromichelaki, E and Mills, G and Howes, C and Eshghi, A and Chatzikyriakidis, S and Purver, M and Kempson, R and Cann, R and Healey, P},
  journal = {Acta Linguistica Hafniensia},
  month = {Oct},
  publisher = {Taylor \& Francis (Routledge)},
  title = {Completability vs (In)completeness},
  url = {https://www.tandfonline.com/doi/full/10.1080/03740463.2020.1795549?instName=Queen+Mary\%2C+University+of+London},
  year = {2020},
  doi = {10.1080/03740463.2020.1795549},
  issn = {0374-0463},
  day = {22},
  publicationstatus = {published}
}
@article{shekhar2020automatingestonian,
  author = {Shekhar, R and Pranjić, M and Pollak, S and Pelicon, A and Purver, M},
  journal = {Journal of Language Technology and Computational Linguistics},
  month = {Sep},
  number = {1},
  pages = {49--79},
  publisher = {German Society for Computational Linguistics \& Language Technology},
  title = {Automating News Comment Moderation with Limited Resources: Benchmarking in Croatian and Estonian},
  url = {https://jlcl.org/content/2-allissues/1-heft1-2020/jlcl_2020-1_3.pdf},
  volume = {34},
  year = {2020},
  issn = {2190-6858},
  day = {8},
  publicationstatus = {published}
}
@article{lau2020howcontext,
  author = {Lau, JH and Armendariz, CS and Lappin, S and Purver, M and Shu, C},
  journal = {Transactions of the Association for Computational Linguistics},
  issn = {2307-387X},
  month = {Jun},
  pages = {296--310},
  publisher = {MIT Press},
  title = {How Furiously Can Colourless Green Ideas Sleep? Sentence Acceptability in Context.},
  url = {https://www.mitpressjournals.org/doi/full/10.1162/tacl_a_00315},
  volume = {8},
  year = {2020},
  doi = {10.1162/tacl_a_00315},
  day = {22},
  publicationstatus = {published}
}
@inproceedings{rohanian2020multimodalspeech,
  author = {Rohanian, M and Hough, J and Purver, M},
  booktitle = {},
  month = {Jan},
  pages = {2187--2191},
  title = {Multi-Modal Fusion with Gating Using Audio, Lexical and Disfluency Features for Alzheimer’s Dementia Recognition from Spontaneous Speech},
  year = {2020},
  doi = {10.21437/interspeech.2020-2721},
  day = {1}
}
@article{saitis2020brightnesscategories,
  author = {Saitis, C and Siedenburg, K},
  journal = {The Journal of the Acoustical Society of America},
  month = {Oct},
  number = {4},
  pages = {2256--2266},
  publisher = {Acoustical Society of America},
  title = {Brightness perception for musical instrument sounds: Relation to timbre dissimilarity and source-cause categories},
  volume = {148},
  year = {2020},
  doi = {10.1121/10.0002275},
  issn = {0001-4966},
  language = {en},
  day = {21},
  publicationstatus = {published}
}
@inproceedings{delgadoluezas2020spectralsounds,
  author = {Delgado Luezas, A and Saitis, C and Sandler, M},
  booktitle = {},
  month = {Sep},
  title = {Spectral and Temporal Timbral Cues of Vocal Imitations of Drum Sounds},
  year = {2020},
  conference = {2nd International Conference on Timbre},
  day = {4},
  publicationstatus = {published}
}
@inproceedings{vahidi2020timbresynthesizer,
  author = {Vahidi, C and Fazekas, G and Saitis, C and Palladini, A},
  booktitle = {},
  month = {Sep},
  title = {Timbre Space Representation of a Subtractive Synthesizer},
  url = {http://timbre2020.mus.auth.gr/},
  year = {2020},
  startyear = {2020},
  startmonth = {Sep},
  startday = {3},
  finishyear = {2020},
  finishmonth = {Sep},
  finishday = {4},
  isbn = {978-960-99845-7-7},
  conference = {International Conference on Timbre (Timbre 2020)},
  day = {3},
  publicationstatus = {published}
}
@inproceedings{marinelli2020musicalspectra,
  author = {Marinelli, L and Lykartsis, A and Weinzierl, S and Saitis, C},
  booktitle = {},
  month = {Aug},
  organization = {Torino},
  title = {Musical dynamics classification with CNN and modulation spectra},
  year = {2020},
  startyear = {2020},
  startmonth = {Aug},
  startday = {24},
  finishyear = {2020},
  finishmonth = {Aug},
  finishday = {26},
  conference = {17th Sound and Music Computing Conference},
  day = {24},
  publicationstatus = {published}
}
@inproceedings{martinezramirez2020modelingnetwork,
  author = {Martinez Ramirez, M and Benetos, E and Reiss, J},
  booktitle = {},
  month = {May},
  organization = {Barcelona, Spain},
  pages = {241--245},
  publisher = {IEEE},
  title = {Modeling plate and spring reverberation using a DSP-informed deep neural network},
  url = {https://2020.ieeeicassp.org/},
  year = {2020},
  abstract = {Plate and spring reverberators are electromechanical systems first used and researched as means to substitute real room reverberation. Currently, they are often used in music production for aesthetic reasons due to their particular sonic characteristics. The modeling of these audio processors and their perceptual qualities is difficult since they use mechanical elements together with analog electronics resulting in an extremely complex response. Based on digital reverberators that use sparse FIR filters, we propose a signal processing-informed deep learning architecture for the modeling of artificial reverberators. We explore the capabilities of deep neural networks to learn such highly nonlinear electromechanical responses and we perform modeling of plate and spring reverberators. In order to measure the performance of the model, we conduct a perceptual evaluation experiment and we also analyze how the given task is accomplished and what the model is actually learning.},
  doi = {10.1109/ICASSP40776.2020.9053093},
  startyear = {2020},
  startmonth = {May},
  startday = {4},
  finishyear = {2020},
  finishmonth = {May},
  finishday = {8},
  conference = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2020)},
  day = {4},
  publicationstatus = {published}
}
@article{lee2020realtimeapplause,
  author = {Lee, JRR and Reiss, JD},
  journal = {AES: Journal of the Audio Engineering Society},
  month = {May},
  number = {5},
  pages = {261--272},
  title = {Real-Time sound synthesis of audience applause},
  volume = {68},
  year = {2020},
  abstract = {© 2020 Audio Engineering Society. All rights reserved. We investigate a procedural model for synthesizing applause sounds that contains novel aspects to ensure high quality and usability. Synthesis of a single clap is generated as a result of filtering a noise source and applying an envelope with exponential decay, based on prior art and existing experimental data. An ensemble approach is introduced to simulate many clappers in a spatially distributed environment. This renders how applause interacts with the space in which it is hosted, including the room impulse response, and where each clap is situated relative to the listener's position. The applause features realistic build-up and fadeout based on natural audience response. The implementation contains meaningful parameters that allow a user to configure and change the sound to achieve a multitude of different types of applause, such as an “enthusiasm parameter” to simulate the greater perceived intensity from an enthusiastic audience. Subjective evaluation was performed to compare our method against recorded samples and four other popular sound synthesis techniques. It showed that the proposed implementation produced significantly more realistic results than other forms of applause synthesis, and it was almost indistinguishable from real-life recordings.},
  doi = {10.17743/JAES.2020.0025},
  issn = {1549-4950},
  day = {1},
  publicationstatus = {published}
}
@article{martinezramirez2020deepeffects,
  author = {Martinez Ramirez, M and Benetos, E and Reiss, J},
  journal = {Applied Sciences},
  month = {Jan},
  number = {2},
  publisher = {MDPI AG},
  title = {Deep Learning for Black-Box Modeling of Audio Effects},
  url = {https://www.mdpi.com/journal/applsci},
  volume = {10},
  year = {2020},
  abstract = {Virtual analog modeling of audio effects consists of emulating the sound of an audio processor reference device. This digital simulation is normally done by designing mathematical models of these systems. It is often difficult because it seeks to accurately model all components within the effect unit, which usually contains various nonlinearities and time-varying components. Most existing methods for audio effects modeling are either simplified or optimized to a very specific circuit or type of audio effect and cannot be efficiently translated to other types of audio effects. Recently, deep neural networks have been explored as black-box modeling strategies to solve this task, i.e., by using only input–output measurements. We analyse different state-of-the-art deep learning models based on convolutional and recurrent neural networks, feedforward WaveNet architectures and we also introduce a new model based on the combination of the aforementioned models. Through objective perceptual-based metrics and subjective listening tests we explore the performance of these models when modeling various analog audio effects. Thus, we show virtual analog models of nonlinear effects, such as a tube preamplifier; nonlinear effects with memory, such as a transistor-based limiter and nonlinear time-varying effects, such as the rotating horn and rotating woofer of a Leslie speaker cabinet.},
  doi = {10.3390/app10020638},
  issn = {2076-3417},
  day = {16},
  publicationstatus = {published}
}
@inproceedings{peeters2020apostproduction,
  author = {Peeters, GG and Reiss, JD},
  booktitle = {148th Audio Engineering Society International Convention},
  month = {Jan},
  title = {A deep learning approach to sound classification for film audio post-production},
  year = {2020},
  abstract = {© 2020 148th Audio Engineering Society International Convention. All rights reserved. Audio post-production for film involves, among other things, the manipulation of large amounts of audio data. There is a clear need for the automation of many organization and classification tasks that are currently performed manually and repeatedly by sound engineers, such as grouping and renaming multiple audio recordings. Here, we present a method to classify such sound files in two categories, ambient recordings and single-source sounds or sound effects. Automating these organization tasks requires a deep learning model capable of answering questions about the nature of each sound recording based on specific stereo and monaural features. This study focuses on identifying these features and on the design of one possible model. The relevant features for this type of audio classification and the model specifications are discussed. In addition, an evaluation of the model is presented, resulting in high accuracy, precision and recall values for audio classification.},
  day = {1},
  publicationstatus = {published}
}
@article{hu2020tdcsos,
  author = {Hu, W and Ma, T and Wang, Y and Xu, F and Reiss, J},
  journal = {International Journal of Parallel, Emergent and Distributed Systems},
  month = {May},
  number = {3},
  pages = {396--411},
  title = {TDCS: a new scheduling framework for real-time multimedia OS},
  volume = {35},
  year = {2020},
  abstract = {© 2018, © 2018 Informa UK Limited, trading as Taylor \& Francis Group. The emerging real-time cyber-physical system (CPS), such as autonomous vehicle and live interactive media application, requires time deterministic behaviour. This is challenging to achieve by using the traditional general purpose operating system (GPOS). This paper presents a new design of the real-time operating system (OS) scheduling mechanism called ‘time deterministic cyclic scheduling’ (TDCS) mainly for live multimedia tasks processing. This new scheduler shares a similar philosophy as classic cyclic execution but with flexibility and dynamic configuration. This hybrid design is based on both time-reserved based cyclic execution and priority-based pre-emptive scheduling for mixed criticality applications. The simulation results show that this scheduling scheme can achieve predictable timing behaviour of task delay and jitter under high CPU utilisation. This shows that the proposed scheme is promising for low latency high-performance multimedia censoring tasks that occur in a periodic manner.},
  doi = {10.1080/17445760.2018.1539717},
  issn = {1744-5760},
  eissn = {1744-5779},
  day = {3},
  publicationstatus = {published}
}
@inproceedings{oconnor2020perceptualspacesvoice,
  author = {O’Connor, Brendan and Dixon, Simon and Fazekas, George},
  booktitle = {The 2020 Joint {AI} Conference on Music Creativity},
  month = {Oct},
  title = {An Exploratory Study on Perceptual Spaces of the Singing Voice},
  url = {https://boblsturm.github.io/aimusic2020/programme.html},
  volume = {1},
  isbn = {978-91-519-5560-5},
  abstract = {Sixty participants provided dissimilarity ratings between various singing techniques. Multidimensional scaling, class averaging and clustering techniques were used to analyse timbral spaces and how they change between different singers, genders and registers. Clustering analysis showed that ground-truth similarity and silhouette scores were not significantly different between gender or register conditions, while similarity scores were positively correlated with participants’ instrumental abilities and task comprehension. Participant feedback showed how a revised study design might mitigate noise in our data, leading to more detailed statistical results. Timbre maps and class distance analysis showed us which singing techniques remained similar to one another across gender and register conditions. This research provides insight into how the timbre space of singing changes under different conditions, highlights the subjectivity of perception between participants, and provides generalised timbre maps for regularisation in machine learning.},
  eventtitle = {Proceedings of the 2020 Joint Conference on {AI} Creativity},
  year = {2020},
  location = {Stockholm, Sweden},
  day = {19},
  publicationstatus = {published}
}
@inproceedings{Sarmento2020,
  arxivid = {2006.12305v1},
  author = {Sarmento, Pedro and Holmqvist, Ove and Barthet, Mathieu},
  booktitle = {Proceedings of the 2020 Ubiquitous Music Workshop},
  title = {{Musical Smart City: Perspectives on Ubiquitous Sonification}},
  year = {2020},
  publicationstatus = {published},
  abstract = {Smart cities are urban areas with sensor networks that collect data used towards efficient management. As a source of ubiquitous data, smart city initiatives present opportunities to enhance inhabitants' urban awareness. However, making sense of smart city data is challenging and there is a gap between available data and end-user applications. Sonification emerges as a promising method for the interpretation of smart city data and the production of novel musical experiences. In this paper, we first present the smart city paradigm. We then cover the topics of ubiquitous and mobile music, followed by an overview of sonification research. Finally, we propose an approach entitled ubiquitous sonification and present the initial design of a speculative use case for musical smart city systems, leveraging user and urban data to inform behaviour.}
}
@comment{{jabref-meta: databaseType:bibtex;}}
