pubs2021.bib

@article{pham2021cnnmoedetection,
  author = {Pham, LD and Phan, H and Palaniappan, R and Mertins, A and McLoughlin, I},
  journal = {IEEE Journal of Biomedical and Health Informatics},
  month = {Mar},
  publisher = {Institute of Electrical and Electronics Engineers},
  title = {CNN-MoE based framework for classification of respiratory anomalies and lung disease detection.},
  url = {https://www.ncbi.nlm.nih.gov/pubmed/33684048},
  volume = {PP},
  year = {2021},
  abstract = {This paper presents and explores a robust deep learning framework for auscultation analysis. This aims to classify anomalies in respiratory cycles and detect diseases, from respiratory sound recordings. The framework begins with front-end feature extraction that transforms input sound into a spectrogram representation. Then, a back-end deep learning network is used to classify the spectrogram features into categories of respiratory anomaly cycles or diseases. Experiments, conducted over the ICBHI benchmark dataset of respiratory sounds, confirm three main contributions towards respiratory-sound analysis. Firstly, we carry out an extensive exploration of the effect of spectrogram types, spectral-time resolution, overlapping/non-overlapping windows, and data augmentation on final prediction accuracy. This leads us to propose a novel deep learning system, built on the proposed framework, which outperforms current state-of-the-art methods. Finally, we apply a Teacher-Student scheme to achieve a trade-off between model performance and model complexity which holds promise for building real-time applications.},
  doi = {10.1109/JBHI.2021.3064237},
  issn = {2168-2208},
  eissn = {2168-2208},
  language = {eng},
  day = {8},
  publicationstatus = {published}
}
@article{phan2021xsleepnetstaging,
  author = {Phan, H and Chen, OY and Tran, MC and Koch, P and Mertins, A and De Vos, M},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  month = {Mar},
  organization = {United States},
  title = {XSleepNet: Multi-View Sequential Model for Automatic Sleep Staging.},
  url = {https://www.ncbi.nlm.nih.gov/pubmed/33788679},
  volume = {PP},
  year = {2021},
  abstract = {Automating sleep staging is vital to scale up sleep assessment and diagnosis to serve millions experiencing sleep deprivation and disorders and enable longitudinal sleep monitoring in home environments. This work proposes a sequence-to-sequence sleep staging model, XSleepNet, that is capable of learning a joint representation from both raw signals and time-frequency images. Since different views may generalize or overfit at different rates, the proposed network is trained such that the learning pace on each view is adapted based on their generalization/overfitting behavior. As a result, the network is able to retain the representation power of different views in the joint features which represent the underlying distribution better than those learned by each individual view alone. Furthermore, the XSleepNet architecture is principally designed to gain robustness to the amount of training data and to increase the complementarity between the input views. Experimental results on five databases of different sizes show that XSleepNet consistently outperforms the single-view baselines and the multi-view baseline with a simple fusion strategy. Finally, XSleepNet also outperforms prior sleep staging methods and improves previous state-of-the-art results on the experimental databases.},
  doi = {10.1109/TPAMI.2021.3070057},
  eissn = {1939-3539},
  language = {eng},
  day = {31},
  publicationstatus = {online-published}
}
@inproceedings{turchet2021musicalapproach,
  author = {Turchet, L and Baker, D and Stockman, T},
  booktitle = {IMX 2021 - Proceedings of the 2021 ACM International Conference on Interactive Media Experiences},
  month = {Jun},
  pages = {20--27},
  title = {Musical haptic wearables for synchronisation of visually-impaired performers: A co-design approach},
  year = {2021},
  abstract = {The emergence of new technologies is providing opportunities to develop novel solutions that facilitate the integration of visually-impaired people in different activities of our daily life, including collective music making. This paper presents a study conducted with visually-impaired music performers, which involved a participatory approach to the design of accessible technologies for musical communication in group playing. We report on three workshops that were conducted together with members of an established ensemble of solely visually-impaired musicians. The first workshop focused on the identification of the participants' needs during the activity of playing in groups and how technology could satisfy such needs. The second and third workshops investigated, respectively, the activities of choir singing and instrument playing in ensemble, focusing on the key issue of synchronisation that was identified in the first workshop. The workshops involved prototypes of musical haptic wearables, which were co-designed and evaluated by the participants. Overall, results indicate that wireless tactile communication represents a promising avenue to cater effectively to the needs of visually-impaired performers.},
  doi = {10.1145/3452918.3458803},
  isbn = {9781450383899},
  day = {21},
  publicationstatus = {published}
}
@inproceedings{vahidi2021atagging,
  author = {Vahidi, C and Fazekas, G and Saitis, C},
  booktitle = {},
  month = {Jul},
  title = {A Modulation Front-End for Music Audio Tagging},
  year = {2021},
  abstract = {Convolutional Neural Networks have been extensively explored in the task of automatic music tagging. The problem can be approached by using either engineered time-frequency features or raw audio as input. Modulation filter bank representations that have been actively researched as a basis for timbre perception have the potential to facilitate the extraction of perceptually salient features. We explore end-to-end learned front-ends for audio representation learning, ModNet and SincModNet, that incorporate a temporal modulation processing block. The structure is effectively analogous to a modulation filter bank, where the FIR filter center frequencies are learned in a data-driven manner. The expectation is that a perceptually motivated filter bank can provide a useful representation for identifying music features. Our experimental results provide a fully visualisable and interpretable front-end temporal modulation decomposition of raw audio. We evaluate the performance of our model against the state-of-the-art of music tagging on the MagnaTagATune dataset. We analyse the impact on performance for particular tags when time-frequency bands are subsampled by the modulation filters at a progressively reduced rate. We demonstrate that modulation filtering provides promising results for music tagging and feature representation, without using extensive musical domain knowledge in the design of this front-end.},
  startyear = {2021},
  startmonth = {Jul},
  startday = {18},
  finishyear = {2021},
  finishmonth = {Jul},
  finishday = {22},
  conference = {International Joint Conference on Neural Networks (IJCNN)},
  day = {18},
  publicationstatus = {accepted}
}
@inproceedings{nasreen2021rareclassdiagnosis,
  author = {Nasreen, S and Hough, J and Purver, M},
  booktitle = {},
  month = {Jul},
  title = {Rare-Class Dialogue Act Tagging for Alzheimer's Disease Diagnosis},
  url = {https://sigdial.org/sites/default/files/workshops/conference22/Proceedings/pdf/2021.sigdial-1.32.pdf},
  year = {2021},
  conference = {SIGDIAL: 22nd Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  day = {29},
  publicationstatus = {published}
}
@inproceedings{karan2021mitigatingdialogue,
  author = {Karan, M and Khare, P and Healey, P and Purver, M},
  booktitle = {},
  month = {Jul},
  title = {Mitigating Topic Bias when Detecting Decisions in Dialogue},
  url = {https://sigdial.org/sites/default/files/workshops/conference22/Proceedings/pdf/2021.sigdial-1.56.pdf},
  year = {2021},
  conference = {SIGDIAL: 22nd Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  day = {29},
  publicationstatus = {published}
}
@article{purver2021incrementalsemantics,
  author = {Purver, M and Sadrzadeh, M and Kempson, R and Wijnholds, G and Hough, J},
  journal = {Journal of Logic, Language and Information},
  month = {Jul},
  publisher = {Springer Verlag},
  title = {Incremental Composition in Distributional Semantics},
  url = {https://doi.org/10.1007/s10849-021-09337-8},
  year = {2021},
  doi = {10.1007/s10849-021-09337-8},
  issn = {0925-8531},
  day = {7},
  publicationstatus = {published}
}
@article{nasreen2021alzheimersfeatures,
  author = {Nasreen, S and Rohanian, M and Hough, J and Purver, M},
  journal = {Frontiers in Computer Science},
  month = {Jun},
  pages = {640669},
  title = {Alzheimer’s Dementia Recognition From Spontaneous Speech Using Disfluency and Interactional Features},
  volume = {3},
  year = {2021},
  doi = {10.3389/fcomp.2021.640669},
  issn = {2624-9898},
  day = {18},
  publicationstatus = {published}
}
@inproceedings{pelicon2021zeroshotdetection,
  author = {Pelicon, A and Shekhar, R and Martinc, M and Škrlj, B and Purver, M and Pollak, S},
  booktitle = {},
  month = {Apr},
  organization = {Kyiv (online)},
  pages = {30--34},
  title = {Zero-shot Cross-lingual Content Filtering: Offensive Language and Hate Speech Detection},
  year = {2021},
  startyear = {2021},
  startmonth = {Apr},
  startday = {19},
  finishyear = {2021},
  finishmonth = {Apr},
  finishday = {19},
  isbn = {978-1-954085-13-8},
  conference = {EACL workshop on News Media Content Analysis and Automated Report Generation},
  day = {19},
  publicationstatus = {accepted}
}
@inproceedings{pollak2021embeddiacontributions,
  author = {Pollak, S and Robnik-Šikonja, M and Purver, M and Boggia, M and Shekhar, R and Pranjić, M and Salmela, S and Krustok, I and Paju, T and Linden, C-G and Leppänen, L and Zosa, E and Ulčar, M and Freiental, L and Traat, S and Cabrera-Diego, LA and Martinc, M and Lavrač, N and Škrlj, B and Žnidaršič, M and Pelicon, A and Koloski, B and Podpečan, V and Kranjc, J and Sheehan, S and Boros, E and Moreno, J and Doucet, A and Toivonen, H},
  booktitle = {},
  month = {Apr},
  organization = {Kyiv (online)},
  pages = {99--109},
  title = {EMBEDDIA Tools, Datasets and Challenges: Resources and Hackathon Contributions},
  year = {2021},
  startyear = {2021},
  startmonth = {Apr},
  startday = {19},
  finishyear = {2021},
  finishmonth = {Apr},
  finishday = {19},
  isbn = {978-1-954085-13-8},
  conference = {EACL workshop on News Media Content Analysis and Automated Report Generation},
  day = {19},
  publicationstatus = {published}
}
@inproceedings{gan2021towardssubstitution,
  author = {Gan, Y and Chen, X and Huang, Q and Purver, M and Woodward, JR and Xie, J and Huang, P},
  booktitle = {ACL/IJCNLP (1)},
  editor = {Zong, C and Xia, F and Li, W and Navigli, R},
  pages = {2505--2515},
  publisher = {Association for Computational Linguistics},
  title = {Towards Robustness of Text-to-SQL Models against Synonym Substitution.},
  url = {https://aclanthology.org/volumes/2021.acl-long/},
  year = {2021},
  isbn = {978-1-954085-52-7}
}
@article{rohanian2021multimodalspeech,
  author = {Rohanian, M and Hough, J and Purver, M},
  journal = {CoRR},
  title = {Multi-modal fusion with gating using audio, lexical and disfluency features for Alzheimer's Dementia recognition from spontaneous speech.},
  volume = {abs/2106.09668},
  year = {2021}
}
@inproceedings{delbosquetrevinoinvestigatingcomponents,
  author = {Del-Bosque-Trevino, J and Purver, M and Hough, J},
  booktitle = {},
  organization = {Brandeis University, Waltham, MA, USA},
  title = {Investigating the Semantic Wave in Tutorial Dialogues: An Annotation Scheme and Corpus Study on Analogy Components.},
  year = {},
  startyear = {2020},
  startmonth = {Jul},
  startday = {18},
  finishyear = {2020},
  finishmonth = {Jul},
  finishday = {19},
  issn = {2308-2275},
  conference = {24th Workshop on the Semantics and Pragmatics of Dialogue (SemDial)},
  publicationstatus = {accepted}
}
@article{pelicon2021investigatingdetection,
  author = {Pelicon, A and Shekhar, R and Skrlj, B and Purver, M and Pollak, S},
  journal = {PeerJ Computer Science},
  month = {Jun},
  pages = {e559},
  title = {Investigating cross-lingual training for offensive language detection.},
  volume = {7},
  year = {2021},
  day = {25},
  publicationstatus = {published}
}
@article{tenderinireducedexposure,
  author = {Tenderini, M and De Leeuw, E and Eilola, T and Pearce, M},
  journal = {Journal of Experimental Psychology: Learning, Memory, and Cognition},
  publisher = {American Psychological Association},
  title = {Reduced cross-modal affective priming in the L2 of late bilinguals depends on L2 exposure},
  year = {},
  issn = {0278-7393},
  publicationstatus = {accepted}
}
@article{quirogamartinez2021musicianshipdetection,
  author = {Quiroga-Martinez, DR and Hansen, NC and Højlund, A and Pearce, M and Brattico, E and Holmes, E and Friston, K and Vuust, P},
  journal = {Human Brain Mapping},
  month = {Jan},
  title = {Musicianship and melodic predictability enhance neural gain in auditory cortex during pitch deviance detection},
  year = {2021},
  abstract = {When listening to music, pitch deviations are more salient and elicit stronger prediction error responses when the melodic context is predictable and when the listener is a musician. Yet, the neuronal dynamics and changes in connectivity underlying such effects remain unclear. Here, we employed dynamic causal modeling (DCM) to investigate whether the magnetic mismatch negativity response (MMNm)—and its modulation by context predictability and musical expertise—are associated with enhanced neural gain of auditory areas, as a plausible mechanism for encoding precision-weighted prediction errors. Using Bayesian model comparison, we asked whether models with intrinsic connections within primary auditory cortex (A1) and superior temporal gyrus (STG)—typically related to gain control—or extrinsic connections between A1 and STG—typically related to propagation of prediction and error signals—better explained magnetoencephalography responses. We found that, compared to regular sounds, out-of-tune pitch deviations were associated with lower intrinsic (inhibitory) connectivity in A1 and STG, and lower backward (inhibitory) connectivity from STG to A1, consistent with disinhibition and enhanced neural gain in these auditory areas. More predictable melodies were associated with disinhibition in right A1, while musicianship was associated with disinhibition in left A1 and reduced connectivity from STG to left A1. These results indicate that musicianship and melodic predictability, as well as pitch deviations themselves, enhance neural gain in auditory cortex during deviance detection. Our findings are consistent with predictive processing theories suggesting that precise and informative error signals are selected by the brain for subsequent hierarchical processing.},
  doi = {10.1002/hbm.25638},
  issn = {1065-9471},
  eissn = {1097-0193},
  day = {1},
  publicationstatus = {published}
}
@article{defleurian2021theanalysis,
  author = {de Fleurian, R and Pearce, MT},
  journal = {i-Perception},
  month = {Jul},
  number = {4},
  pages = {1--11},
  title = {The Relationship Between Valence and Chills in Music: A Corpus Analysis},
  volume = {12},
  year = {2021},
  abstract = {Chills experienced in response to music listening have been linked to both happiness and sadness expressed by music. To investigate these conflicting effects of valence on chills, we conducted a computational analysis on a corpus of 988 tracks previously reported to elicit chills, by comparing them with a control set of tracks matched by artist, duration, and popularity. We analysed track-level audio features obtained with the Spotify Web API across the two sets of tracks, resulting in confirmatory findings that tracks which cause chills were sadder than matched tracks and exploratory findings that they were also slower, less intense, and more instrumental than matched tracks on average. We also found that the audio characteristics of chills tracks were related to the direction and magnitude of the difference in valence between the two sets of tracks. We discuss these results in light of the current literature on valence and chills in music, provide a new interpretation in terms of personality correlates of musical preference, and review the advantages and limitations of our computational approach.},
  doi = {10.1177/20416695211024680},
  day = {27},
  publicationstatus = {published}
}
@article{hansenpredictiveperception,
  author = {Hansen, NC and Kragness, H and Vuust, P and Trainor, L and Pearce, M},
  journal = {Psychological Science},
  publisher = {SAGE Publications},
  title = {Predictive Uncertainty Underlies Auditory-Boundary Perception},
  year = {},
  issn = {0956-7976},
  publicationstatus = {accepted}
}
@article{clemente2021evaluativedesigns,
  author = {Clemente, A and Pearce, MT and Skov, M and Nadal, M},
  journal = {Brain and Cognition},
  month = {Jul},
  title = {Evaluative judgment across domains: Liking balance, contour, symmetry and complexity in melodies and visual designs},
  volume = {151},
  year = {2021},
  abstract = {Evaluative judgment—i.e., assessing to what degree a stimulus is liked or disliked—is a fundamental aspect of cognition, facilitating comparison and choosing among alternatives, deciding, and prioritizing actions. Neuroimaging studies have shown that evaluative judgment involves the projection of sensory information to the reward circuit. To investigate whether evaluative judgments are based on modality-specific or modality-general attributes, we compared the extent to which balance, contour, symmetry, and complexity affect liking responses in the auditory and visual modalities. We found no significant correlation for any of the four attributes across sensory modalities, except for contour. This suggests that evaluative judgments primarily rely on modality-specific sensory representations elaborated in the brain's sensory cortices and relayed to the reward circuit, rather than abstract modality-general representations. The individual traits art experience, openness to experience, and desire for aesthetics were associated with the extent to which design or compositional attributes influenced liking, but inconsistently across sensory modalities and attributes, also suggesting modality-specific influences.},
  doi = {10.1016/j.bandc.2021.105729},
  issn = {0278-2626},
  eissn = {1090-2147},
  day = {1},
  publicationstatus = {published}
}
@article{hall2021astructure,
  author = {Hall, ETR and Pearce, MT},
  journal = {Journal of New Music Research},
  month = {May},
  number = {3},
  pages = {220--241},
  publisher = {Informa UK Limited},
  title = {A model of large-scale thematic structure},
  volume = {50},
  year = {2021},
  abstract = {The coherent organisation of thematic material into large-scale structures within a composition is an important concept in both traditional and cognitive theories of music. However, empirical evidence supporting their perception is scarce. Providing a more nuanced approach, this paper introduces a computational model of hypothesised cognitive mechanisms underlying perception of large-scale thematic structure. Repetition detection based on statistical learning forms the model's foundation, hypothesising that predictability arising from repetition creates perceived thematic coherence. Measures are produced that characterise structural properties of a corpus of 623 monophonic compositions. Exploratory analysis reveals the extent to which these measures vary systematically and independently.},
  doi = {10.1080/09298215.2021.1930062},
  issn = {0929-8215},
  eissn = {1744-5027},
  language = {en},
  day = {27},
  publicationstatus = {published}
}
@article{harrison2021erratum101371journalpcbi1008304,
  author = {Harrison, PMC and Bianco, R and Chait, M and Pearce, MT},
  journal = {PLoS Computational Biology},
  month = {May},
  number = {5},
  title = {Erratum: PPM-Decay: A computational model of auditory prediction with memory decay (PLoS Comput Biol (2020) 16: 11 (e1008304) DOI: 10.1371/journal.pcbi.1008304)},
  volume = {17},
  year = {2021},
  abstract = {A grant acknowledgement was missing from the funding statement. The funding statement for this article should read as follows: “PH was supported by a doctoral studentship from the Engineering and Physical Sciences Research Council (EPSRC, https://epsrc.ukri.org/) and Arts and Humanities Research Council (AHRC, https://ahrc.ukri.org/) Centre for Doctoral Training in Media and Arts Technology (EP/L01632X/1). The research was additionally funded by a BBSRC grant (BB/P003745/1) to MC and supported by the NIHR UCLH BRC Deafness and Hearing Problems Theme. The funders did not play any role in the study design, data collection and analysis, decision to publish, or preparation of the manuscript.”},
  doi = {10.1371/journal.pcbi.1008995},
  issn = {1553-734X},
  eissn = {1553-7358},
  day = {1},
  publicationstatus = {published}
}
@article{clemente2021musicalsensitivity,
  author = {Clemente, A and Pearce, MT and Nadal, M},
  journal = {Psychology of Aesthetics, Creativity, and the Arts},
  month = {Mar},
  title = {Musical Aesthetic Sensitivity},
  year = {2021},
  doi = {10.1037/aca0000381},
  issn = {1931-3896},
  day = {18},
  publicationstatus = {published}
}
@article{krishnaneffectssequences,
  author = {Krishnan, S and Carey, D and Dick, F and Pearce, MT},
  journal = {Journal of Experimental Psychology: General},
  publisher = {American Psychological Association (APA)},
  title = {Effects of statistical learning in passive and active contexts on reproduction and recognition of auditory sequences.},
  year = {},
  doi = {10.1037/xge0001091},
  issn = {0096-3445},
  eissn = {1939-2222},
  language = {en},
  publicationstatus = {online-published}
}
@inproceedings{ozkan2021specificconversations,
  author = {Ozkan, EE and Gurion, T and Hough, J and Healey, PGT and Jamone, L},
  booktitle = {IEEE International Conference on Development and Learning, ICDL 2021},
  month = {Aug},
  title = {Specific hand motion patterns correlate to miscommunications during dyadic conversations},
  year = {2021},
  abstract = {Effective and natural communication is achieved by exchanging several multi-modal signals through highly coordinated communication mechanisms. These mechanisms are frequently subject to troubles of speaking in the form of disfluencies, typically followed by a self-repair from the speaker (i.e. to try to fix the misunderstanding): overall, these are signs of a possible miscommunication. Automatically detecting miscommunications is crucial to implement conversational agents, either digital or robotic, that could successfully interact with people. This can be done by searching for specific patterns across different communication channels, for example disfluencies in the speech signal or specific movements of the limbs. However, what are the motion patterns that correlate to miscommunications is still unclear. In this paper we report a human study in which we identify one of such patterns: in particular, we show that the hands of the speaker reliably move upwards during miscommunications. We performed a statistical analysis of synchronized speech and motion tracking data extracted from natural conversations of 15 dyads; our results show a statistically significant tendency of moving hands upwards during speech disfluencies, which are a clear sign of miscommunication.},
  doi = {10.1109/ICDL49984.2021.9515613},
  isbn = {9781728162423},
  day = {23},
  publicationstatus = {published}
}
@incollection{healey2021humanlikecommunication,
  author = {Healey, PGT},
  booktitle = {Human-Like Machine Intelligence},
  month = {Jul},
  pages = {137--151},
  title = {Human-like Communication},
  year = {2021},
  doi = {10.1093/oso/9780198862536.003.0007},
  day = {13}
}
@inproceedings{park2021shouldblush,
  author = {Park, S and Healey, PGT and Kaniadakis, A},
  booktitle = {Conference on Human Factors in Computing Systems - Proceedings},
  month = {May},
  title = {Should robots blush?},
  year = {2021},
  abstract = {Social interaction is the most complex challenge in daily life. Inevitably, social robots will encounter interactions that are outside their competence. This raises a basic design question: how can robots fail gracefully in social interaction? The characteristic human response to social failure is embarrassment. Usefully, embarrassment signals both recognition of a problem and typically enlists sympathy and assistance to resolve it. This could enhance robot acceptability and provides an opportunity for interactive learning. Using a speculative design approach we explore how, when and why robots might communicate embarrassment. A series of specially developed cultural probes, scenario development and low-fidelity prototyping exercises suggest that: embarrassment is relevant for managing a diverse range of social scenarios, impacts on both humanoid and non-humanoid robot design, and highlights the critical importance of understanding interactional context. We conclude that embarrassment is fundamental to competent social functioning and provides a potentially fertile area for interaction design.},
  doi = {10.1145/3411764.3445561},
  isbn = {9781450380966},
  day = {6},
  publicationstatus = {published}
}
@article{nagele2021interactiveperformance,
  author = {Nagele, AN and Bauer, V and Healey, PGT and Reiss, JD and Cooke, H and Cowlishaw, T and Baume, C and Pike, C},
  journal = {Frontiers in Virtual Reality},
  month = {Feb},
  pages = {610320},
  title = {Interactive Audio Augmented Reality in Participatory Performance},
  volume = {1},
  year = {2021},
  doi = {10.3389/frvir.2020.610320},
  issn = {2673-4192},
  day = {12}
}
@article{skach2021sensingtrousers,
  author = {Skach, S and Stewart, R and Healey, PGT},
  journal = {IEEE Pervasive Computing},
  month = {Jul},
  number = {3},
  pages = {30--40},
  title = {Sensing Social Behavior with Smart Trousers},
  volume = {20},
  year = {2021},
  abstract = {Nonverbal signals play an important role in social interaction. Body orientation, posture, hand, and leg movements all contribute to successful communication, though research has typically focused on cues transmitted from the torso alone. Here, we explore lower body movements and address two issues. First, the empirical question of what social signals they provide. Second, the technical question of how these movements could be sensed unintrusively and in situations where traditional methods prove challenging. To approach these issues, we propose a soft, wearable sensing system for clothing. Bespoke smart trousers with embedded textile pressure sensors are designed and deployed in seated, multiparty conversations. Using simple machine learning techniques and evaluating individual and community models, our results show that it is possible to distinguish basic conversational states. With the trousers picking up speaking, listening, and laughing, they present an appropriate modality to ubiquitously sense human behavior.},
  doi = {10.1109/MPRV.2021.3088153},
  issn = {1536-1268},
  eissn = {1558-2590},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{zhao2021violinistfeatures,
  author = {Zhao, Y and Wang, C and Fazekas, G and Benetos, E and Sandler, M},
  booktitle = {},
  month = {Aug},
  publisher = {EURASIP},
  title = {Violinist identification based on vibrato features},
  url = {https://eusipco2021.org/},
  year = {2021},
  abstract = {Identifying performers from polyphonic music is a challenging task in music information retrieval. As a ubiquitous expressive element in violin music, vibrato contains important information about the performers' interpretation. This paper proposes to use vibrato features for identifying violinists from commercial orchestral recordings. We present and compare two systems, which take the same note-level melodies as input while using different vibrato feature extractors and classification schemes. One system calculates vibrato features according to vibrato definition, models the feature distribution using histograms, and classifies performers based on the distribution similarity. The other system uses the adaptive wavelet scattering which contains vibrato information and identifies violinists with a machine learning classifier. We report accuracy improvement of 19.8\% and 17.8\%, respectively, over a random baseline on piece-level evaluation. This suggests that vibrato notes in polyphonic music are useful for master violinist identification.},
  startyear = {2021},
  startmonth = {Aug},
  startday = {23},
  finishyear = {2021},
  finishmonth = {Aug},
  finishday = {27},
  conference = {29th European Signal Processing Conference (EUSIPCO)},
  day = {23},
  publicationstatus = {accepted}
}
@inproceedings{manco2021muscapsaudio,
  author = {Manco, I and Benetos, E and Quinton, E and Fazekas, G},
  booktitle = {},
  month = {Jul},
  publisher = {IEEE},
  title = {MusCaps: generating captions for music audio},
  url = {https://ilariamanco.com/},
  url = {https://www.ijcnn.org/},
  year = {2021},
  abstract = {Content-based music information retrieval has seen rapid progress with the adoption of deep learning. Current approaches to high-level music description typically make use of classification models, such as in auto tagging or genre and mood classification. In this work, we propose to address music description via audio captioning, defined as the task of generating a natural language description of music audio content in a human-like manner. To this end, we present the first music audio captioning model, MusCaps, consisting of an encoder-decoder with temporal attention. Our method combines convolutional and recurrent neural network architectures to jointly process audio-text inputs through a multimodal encoder and leverages pre-training on audio data to obtain representations that effectively capture and summarise musical features in the input. Evaluation of the generated captions through automatic metrics shows that our method outperforms a baseline designed for non-music audio captioning. Through an ablation study, we unveil that this performance boost can be mainly attributed to pre-training of the audio encoder, while other design choices – modality fusion, decoding strategy and the use of attention -- contribute only marginally. Our model represents a shift away from classification-based music description and combines tasks requiring both auditory and linguistic understanding to bridge the semantic gap in music information retrieval.},
  doi = {10.1109/IJCNN52387.2021.9533461},
  startyear = {2021},
  startmonth = {Jul},
  startday = {18},
  finishyear = {2021},
  finishmonth = {Jul},
  finishday = {22},
  conference = {International Joint Conference on Neural Networks (IJCNN)},
  day = {18},
  publicationstatus = {published}
}
@incollection{li2021achallenge,
  author = {Li, S and Jing, Y and Fazekas, G},
  booktitle = {},
  month = {Jan},
  pages = {177--186},
  title = {A Novel Dataset for the Identification of Computer Generated Melodies in the CSMT Challenge},
  volume = {761 LNEE},
  year = {2021},
  abstract = {This paper introduces a novel dataset for the identification of computer generated melodies as used in the data challenge organised by the Conference on Sound and Music Technology (CSMT). The CSMT data challenge requires participants to identify whether a given piece of melody is generated by computer or is composed by human. The dataset consists of two parts: a development dataset and an evaluation dataset. The development dataset contains only computer generated melodies whereas the evaluation dataset contains both computer generated melodies and human composed melodies. The aim of the dataset is to facilitate the development and assessment of methods to identify computer generated melodies and facilitate the creation of generative music systems.},
  doi = {10.1007/978-981-16-1649-5_15},
  isbn = {9789811616488},
  issn = {1876-1100},
  eissn = {1876-1119},
  day = {1},
  publicationstatus = {published}
}
@article{lefford2021contextawaresystems,
  author = {Lefford, MN and Bromham, G and Fazekas, G and Moffat, D},
  journal = {Journal of the Audio Engineering Society},
  month = {Mar},
  number = {3},
  pages = {128--141},
  title = {Context-aware intelligent mixing systems},
  volume = {69},
  year = {2021},
  abstract = {Intelligent Mixing Systems (IMS) are rapidly becoming integrated into music mixing and production workflows. The intelligences of a human mixer and IMS can be distinguished by their abilities to comprehend, assess, and appreciate context. Humans will factor context into decisions, particularly concerning the use and application of technologies. The utility of an IMS depends on both its affordances and the situation in which it is to be used. The appropriate use for conventional purposes, or its utility for misappropriation, is determined by the context. This study considers how context impacts mixing decisions and the use of technology, focusing on how the mixer's understanding of context can inform the use of IMS, and how the use of IMS can aid in informing a mixer of different contexts.},
  doi = {10.17743/JAES.2020.0043},
  issn = {1549-4950},
  day = {1},
  publicationstatus = {published}
}
@article{gabrielli2021speciallistening,
  author = {Gabrielli, L and Fazekas, G and Nam, J},
  journal = {Applied Sciences (Switzerland)},
  month = {Jan},
  number = {2},
  pages = {1--4},
  title = {Special issue on deep learning for applications in acoustics: Modeling, synthesis, and listening},
  volume = {11},
  year = {2021},
  doi = {10.3390/app11020473},
  eissn = {2076-3417},
  day = {2},
  publicationstatus = {published}
}
@inproceedings{lbberssketchingassociations,
  author = {Löbbers, S and Barthet, M and Fazekas, G},
  booktitle = {},
  organization = {Santiago de Chile, Chile},
  title = {Sketching sounds: an exploratory study on sound-shape associations},
  year = {},
  abstract = {Sound synthesiser controls typically correspond to technical parameters of signal processing algorithms rather than intuitive sound descriptors that relate to human perception of sound. This makes it difficult to realise sound ideas in a straightforward way. Cross-modal mappings, for example between gestures and sound, have been suggested as a more intuitive control mechanism. A large body of research shows consistency in human associations between sounds and shapes. However, the use of drawings to drive sound synthesis has not been explored to its full extent. This paper presents an exploratory study that asked participants to sketch visual imagery of sounds with a monochromatic digital drawing interface, with the aim to identify different representational approaches and determine whether timbral sound characteristics can be communicated reliably through visual sketches. Results imply that the development of a synthesiser exploiting sound-shape associations is feasible, but a larger and more focused dataset is needed in followup studies.},
  startyear = {2021},
  startmonth = {Jul},
  startday = {25},
  finishyear = {2021},
  finishmonth = {Jul},
  finishday = {31},
  keyword = {timbre perception},
  keyword = {cross-modal associations},
  conference = {International Computer Music Conference},
  publicationstatus = {accepted}
}
@inproceedings{hayesneuralsynthesis,
  author = {Hayes, B and Saitis, C and Fazekas, G},
  booktitle = {},
  organization = {Online},
  title = {Neural Waveshaping Synthesis},
  url = {https://benhayes.net/},
  year = {},
  abstract = {We present the Neural Waveshaping Unit (NEWT): a novel, lightweight, fully causal approach to neural audio synthesis which operates directly in the waveform domain, with an accompanying optimisation (FastNEWT) for efficient CPU inference. The NEWT uses time-distributed multilayer perceptrons with periodic activations to implicitly learn nonlinear transfer functions that encode the characteristics of a target timbre. Once trained, a NEWT can produce complex timbral evolutions by simple affine transformations of its input and output signals. We paired the NEWT with a differentiable noise synthesiser and reverb and found it capable of generating realistic musical instrument performances with only 260k total model parameters, conditioned on F0 and loudness features. We compared our method to state-of-the-art benchmarks with a multi-stimulus listening test and the Fréchet Audio Distance and found it performed competitively across the tested timbral domains. Our method significantly outperformed the benchmarks in terms of generation speed, and achieved real-time performance on a consumer CPU, both with and without FastNEWT, suggesting it is a viable basis for future creative sound design tools.},
  startyear = {2021},
  startmonth = {Nov},
  startday = {8},
  finishyear = {2021},
  finishmonth = {Nov},
  finishday = {12},
  keyword = {neural audio synthesis},
  keyword = {signal processing},
  keyword = {deep learning},
  keyword = {machine learning},
  keyword = {audio synthesis},
  conference = {22nd International Society for Music Information Retrieval Conference (ISMIR)},
  publicationstatus = {accepted}
}
@inproceedings{graf2021anvisualisation,
  author = {Graf, M and Opara, HC and Barthet, M},
  booktitle = {},
  month = {Jun},
  title = {An Audio-Driven System for Real-Time Music Visualisation},
  url = {https://maxgraf.space/},
  year = {2021},
  abstract = {Computer-generated visualisations can accompany recorded or live music to create novel audiovisual experiences for audiences. We present a system to streamline the creation of audio-driven visualisations based on audio feature extraction and mapping interfaces. Its architecture is based on three modular software components: backend (audio plugin), frontend (3D game-like environment), and middleware (visual mapping interface). We conducted a user evaluation comprising two stages. Results from the first stage (34 participants) indicate that music visualisations generated with the system were significantly better at complementing the music than a baseline visualisation. Nine participants took part in the second stage involving interactive tasks. Overall, the system yielded a Creativity Support Index above average (68.1) and a System Usability Scale index (58.6) suggesting that ease of use can be improved. Thematic analysis revealed that participants enjoyed the system’s synchronicity and expressive capabilities, but found technical problems and difficulties understanding the audio feature terminology.},
  startyear = {2021},
  startmonth = {May},
  startday = {24},
  finishyear = {2021},
  finishmonth = {May},
  finishday = {28},
  keyword = {music visualisation},
  keyword = {mir},
  keyword = {audio features},
  conference = {Audio Engineering Society Convention 150},
  day = {24},
  publicationstatus = {published}
}
@article{yang2021examiningperformance,
  author = {Yang, S and Reed, CN and Chew, E and Barthet, M},
  journal = {IEEE Transactions on Affective Computing},
  month = {Jan},
  title = {Examining Emotion Perception Agreement in Live Music Performance},
  year = {2021},
  abstract = {Current music emotion recognition (MER) systems rely on emotion data averaged across listeners and over time to infer the emotion expressed by a musical piece, often neglecting time- and listener-dependent factors. These limitations can restrict the efficacy of MER systems and cause misjudgements. In a live music concert setting, fifteen audience members annotated perceived emotion in valence-arousal space over time using a mobile application. Analyses of inter-rater reliability yielded widely varying levels of agreement in the perceived emotions. A follow-up lab study to uncover the reasons for such variability was conducted, where twenty-one listeners annotated their perceived emotions through a recording of the original performance and offered open-ended explanations. Thematic analysis reveals many salient features and interpretations that can describe the cognitive processes. Some of the results confirm known findings of music perception and MER studies. Novel findings highlight the importance of less frequently discussed musical attributes, such as musical structure, performer expression, and stage setting, as perceived across different modalities. Musicians are found to attribute emotion change to musical harmony, structure, and performance technique more than non-musicians. We suggest that listener-informed musical features can benefit MER in addressing emotional perception variability by providing reasons for listener similarities and idiosyncrasies.},
  doi = {10.1109/TAFFC.2021.3093787},
  eissn = {1949-3045},
  day = {1},
  publicationstatus = {published}
}
@article{morfi2021deepevents,
  author = {Morfi, V and Lachlan, RF and Stowell, D},
  journal = {Journal of the Acoustical Society of America},
  month = {Jul},
  number = {1},
  pages = {2--11},
  title = {Deep perceptual embeddings for unlabelled animal sound events},
  volume = {150},
  year = {2021},
  abstract = {Evaluating sound similarity is a fundamental building block in acoustic perception and computational analysis. Traditional data-driven analyses of perceptual similarity are based on heuristics or simplified linear models, and are thus limited. Deep learning embeddings, often using triplet networks, have been useful in many fields. However, such networks are usually trained using large class-labelled datasets. Such labels are not always feasible to acquire. We explore data-driven neural embeddings for sound event representation when class labels are absent, instead utilising proxies of perceptual similarity judgements. Ultimately, our target is to create a perceptual embedding space that reflects animals' perception of sound. We create deep perceptual embeddings for bird sounds using triplet models. In order to deal with the challenging nature of triplet loss training with the lack of class-labelled data, we utilise multidimensional scaling (MDS) pretraining, attention pooling, and a triplet mining scheme. We also evaluate the advantage of triplet learning compared to learning a neural embedding from a model trained on MDS alone. Using computational proxies of similarity judgements, we demonstrate the feasibility of the method to develop perceptual models for a wide range of data based on behavioural judgements, helping us understand how animals perceive sounds.},
  doi = {10.1121/10.0005475},
  issn = {0001-4966},
  eissn = {1520-8524},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{bodo2021aidentification,
  author = {Bodo, RPP and Benetos, E and Queiroz, M},
  booktitle = {},
  month = {Nov},
  organization = {Tokyo, Japan},
  title = {A framework for music similarity and cover song identification},
  url = {https://www.cmmr2021.gttm.jp/},
  year = {2021},
  abstract = {This paper presents a framework for music information retrieval tasks which relate to music similarity. The framework is based on a pipeline consisting of audio feature extraction, feature aggregation and distance measurements, which generalizes previous work and includes hundreds of similarity models not previously considered in the literature. This general pipeline is subjected to a comprehensive benchmark of analogously defined music similarity models over the task of cover song identification. Experimental results provide scientific evidence for certain preferred combined choices of features, aggregations and distances, while pointing towards novel combinations of such elements with the potential to improve the performance of music similarity models on specific MIR tasks.},
  startyear = {2021},
  startmonth = {Nov},
  startday = {15},
  finishyear = {2021},
  finishmonth = {Nov},
  finishday = {19},
  conference = {15th International Symposium on Computer Music Multidisciplinary Research (CMMR)},
  day = {15},
  publicationstatus = {accepted}
}
@inproceedings{viannalordelo2021pitchinformedshapes,
  author = {Vianna Lordelo, C and Benetos, E and Dixon, S and Ahlbäck, S},
  booktitle = {},
  month = {Nov},
  title = {Pitch-informed instrument assignment using a deep convolutional network with multiple kernel shapes},
  url = {https://cpvlordelo.github.io/},
  url = {https://ismir2021.ismir.net/},
  year = {2021},
  abstract = {This paper proposes a deep convolutional neural network for performing note-level instrument assignment. Given a polyphonic multi-instrumental music signal along with its ground truth or predicted notes, the objective is to assign an instrumental source for each note. This problem is addressed as a pitch-informed classification task where each note is analysed individually. We also propose to utilise several kernel shapes in the convolutional layers in order to facilitate learning of timbre-discriminative feature maps. Experiments on the MusicNet dataset using 7 instrument classes show that our approach is able to achieve an average F-score of 0.904 when the original multi-pitch annotations are used as the pitch information for the system, and that it also excels if the note information is provided using third-party multi-pitch estimation algorithms. We also include ablation studies investigating the effects of the use of multiple kernel shapes and comparing different input representations for the audio and the note-related information.},
  startyear = {2021},
  startmonth = {Nov},
  startday = {9},
  finishyear = {2021},
  finishmonth = {Nov},
  finishday = {12},
  conference = {22nd International Society for Music Information Retrieval Conference (ISMIR)},
  day = {9},
  publicationstatus = {accepted}
}
@inproceedings{ozaki2021agreementsongs,
  author = {Ozaki, Y and McBride, J and Benetos, E and Pfordresher, PQ and Six, J and Tierney, AT and Proutskova, P and Sakai, E and Kondo, H and Fukatsu, H and Fujii, S and Savage, PE},
  booktitle = {},
  month = {Nov},
  publisher = {International Society for Music Information Retrieval},
  title = {Agreement among human and automated transcriptions of global songs},
  url = {https://ismir2021.ismir.net/},
  year = {2021},
  abstract = {Cross-cultural musical analysis requires standardized symbolic representation of sounds such as score notation. However, transcription into notation is usually conducted manually by ear, which is time-consuming and subjective. Our aim is to evaluate the reliability of existing methods for transcribing songs from diverse societies. We had 3 experts independently transcribe a sample of 32 excerpts of traditional monophonic songs from around the world (half a cappella, half with instrumental accompaniment). 16 songs also had pre-existing transcriptions created by 3 different experts. We compared these human transcriptions against one another and against 10 automatic music transcription algorithms. We found that human transcriptions can be sufficiently reliable (~90\% agreement, κ ~.7), but current automated methods are not (<60\% agreement, κ <.4). No automated method clearly outperformed others, in contrast to our predictions. These results suggest that improving automated methods for cross-cultural music transcription is critical for diversifying MIR.},
  startyear = {2021},
  startmonth = {Nov},
  startday = {9},
  finishyear = {2021},
  finishmonth = {Nov},
  finishday = {12},
  conference = {22nd International Society for Music Information Retrieval Conference (ISMIR)},
  day = {9},
  publicationstatus = {accepted}
}
@inproceedings{ohanlon2021detectingnetworks,
  author = {O'Hanlon, K and Benetos, E and Dixon, S},
  booktitle = {},
  month = {Oct},
  organization = {Gold Coast, Queensland, Australia},
  publisher = {IEEE},
  title = {Detecting cover songs with pitch class key-invariant networks},
  url = {https://2021.ieeemlsp.org/},
  year = {2021},
  abstract = {Deep Learning (DL) has recently been applied successfully to the task of Cover Song Identification (CSI). Meanwhile, neural networks that consider music signal data structure in their design have been developed. In this paper, we propose a Pitch Class Key-Invariant Network, PiCKINet, for CSI.  Like some other CSI networks, PiCKINet inputs a Constant-Q Transform (CQT) pitch feature.  Unlike other such networks, large multi-octave kernels produce a latent representation with pitch class dimensions that are maintained throughout PiCKINet by key-invariant convolutions.
PiCKINet is seen to be more effective, and efficient, than other CQT-based networks.  We also propose an extended variant, PiCKINet+, that employs a centre loss penalty, squeeze and excite units, and octave swapping data augmentation.  PiCKINet+ shows an improvement of ~17\% MAP relative to the well-known CQTNet when tested on a set of ~16K tracks.},
  startyear = {2021},
  startmonth = {Oct},
  startday = {25},
  finishyear = {2021},
  finishmonth = {Oct},
  finishday = {28},
  conference = {IEEE International Workshop on Machine Learning for Signal Processing (MLSP)},
  day = {25},
  publicationstatus = {accepted}
}
@inproceedings{sarkar2021vocalnetworks,
  author = {Sarkar, S and Benetos, E and Sandler, M},
  booktitle = {},
  month = {Aug},
  organization = {Brno, Czech Republic},
  pages = {3515--3519},
  title = {Vocal Harmony Separation using Time-domain Neural Networks},
  year = {2021},
  abstract = {Polyphonic vocal recordings are an inherently challenging source separation task due to the melodic structure of the vocal parts and unique timbre of its constituents. In this work, we utilise a time-domain neural network architecture re-purposed from speech separation research and modify it to separate a capella mixtures at a high sampling rate. We use four-part (soprano, alto, tenor and bass) a capella recordings of Bach Chorales and Barbershop Quartets for our experiments. Unlike current deep learning based choral separation models where the training objective is to separate constituent sources based on their class, we train our model using a permutation invariant objective. Using this we achieve state-of-the-art results for choral music separation. We introduce a novel method to estimate harmonic overlap between sung musical notes as a measure of task complexity. We also present an analysis of the impact of randomised mixing, input lengths and filterbank lengths for our task. Our results show a moderate negative correlation between the harmonic overlap of the target sources and source separation performance. We report that training our models with randomly mixed musically-incoherent mixtures drastically reduces the performance of vocal harmony separation as it decreases the average harmonic overlap presented during training.},
  doi = {10.21437/Interspeech.2021-1531},
  startyear = {2021},
  startmonth = {Aug},
  startday = {30},
  finishyear = {2021},
  finishmonth = {Sep},
  finishday = {3},
  conference = {22nd Annual Conference of the International Speech Communication Association (INTERSPEECH)},
  day = {30},
  publicationstatus = {published}
}
@inproceedings{bear2021angeotagging,
  author = {Bear, H and Morfi, V and Benetos, E},
  booktitle = {},
  month = {Aug},
  organization = {Brno, Czech Republic},
  pages = {581--585},
  publisher = {International Speech Communication Association (ISCA)},
  title = {An evaluation of data augmentation methods for sound scene geotagging},
  url = {https://www.interspeech2021.org/},
  year = {2021},
  abstract = {Sound scene geotagging is a new topic of research which has evolved from acoustic scene classification. It is motivated by the idea of audio surveillance. Not content with only describing a scene in a recording, a machine which can locate where the recording was captured would be of use to many. In this paper we explore a series of common audio data augmentation methods to evaluate which best improves the accuracy of audio geotagging classifiers. Our work improves on the state-of-the-art city geotagging method by 23\% in terms of classification accuracy.},
  doi = {10.21437/Interspeech.2021-1837},
  startyear = {2021},
  startmonth = {Aug},
  startday = {30},
  finishyear = {2021},
  finishmonth = {Sep},
  finishday = {3},
  conference = {22nd Annual Conference of the International Speech Communication Association (INTERSPEECH)},
  day = {30},
  publicationstatus = {published}
}
@incollection{liu2021fromnotation,
  address = {Cham, Switzerland},
  author = {Liu, L and Benetos, E},
  booktitle = {Handbook of Artificial Intelligence for Music},
  edition = {1st},
  editor = {Miranda, ER},
  month = {Aug},
  number = {24},
  pages = {693--714},
  publisher = {Springer International Publishing},
  series = {Artificial Intelligence},
  title = {From Audio to Music Notation},
  url = {https://cheriell.github.io/},
  url = {https://link.springer.com/chapter/10.1007/978-3-030-72116-9_24},
  year = {2021},
  abstract = {The field of Music Information Retrieval (MIR) focuses on creating methods and practices for making sense of music data from various modalities, including audio, video, images, scores and metadata. Within MIR, a core problem which to this day remains open is Automatic Music Transcription (AMT), the process of automatically converting an acoustic music signal into some form of musical notation. The creation of a method for automatically converting musical audio to notation has several uses including but also going beyond MIR: from software for automatic typesetting of audio into staff notation or other music representations, to the use of automatic transcriptions as a descriptor towards the development of systems for music recommendation, to applications for interactive music systems such as automatic music accompaniment, for music education through methods for automatic instrument tutoring, and towards enabling musicological research in sound archives, to name but a few.},
  doi = {10.1007/978-3-030-72116-9_24},
  isbn = {978-3-030-72116-9},
  keyword = {music information retrieval},
  keyword = {artificial intelligence},
  keyword = {automatic music transcription},
  numberofpieces = {34},
  day = {3},
  publicationstatus = {published}
}
@inproceedings{cheuk2021revisitingattention,
  author = {Cheuk, KW and Luo, Y-J and Benetos, E and Herremans, D},
  booktitle = {},
  month = {Jul},
  publisher = {IEEE},
  title = {Revisiting the onsets and frames model with additive attention},
  url = {https://www.ijcnn.org/},
  year = {2021},
  abstract = {Recent advances in automatic music transcription (AMT) have achieved highly accurate polyphonic piano transcription results by incorporating onset and offset detection. The existing literature, however, focuses mainly on the leverage of deep and complex models to achieve state-of-the-art (SOTA) accuracy, without understanding model behaviour. In this paper, we conduct a comprehensive examination of the Onsets-and-Frames AMT model, and pinpoint the essential components contributing to a strong AMT performance. This is achieved through exploitation of a modified additive attention mechanism. The experimental results suggest that the attention mechanism beyond a moderate temporal context does not benefit the model, and that rule-based post-processing is largely responsible for the SOTA performance. We also demonstrate that the onsets are the most significant attentive feature regardless of model complexity. The findings encourage AMT research to weigh more on both a robust onset detector and an effective post-processor.},
  doi = {10.1109/IJCNN52387.2021.9533407},
  startyear = {2021},
  startmonth = {Jul},
  startday = {18},
  finishyear = {2021},
  finishmonth = {Jul},
  finishday = {22},
  conference = {International Joint Conference on Neural Networks (IJCNN)},
  day = {18},
  publicationstatus = {published}
}
@inproceedings{ragano2021moreannotations,
  author = {Ragano, A and Benetos, E and Hines, A},
  booktitle = {},
  url = {https://qomex2021.itec.aau.at/},
  month = {Jun},
  title = {More for Less: Non-Intrusive Speech Quality Assessment with Limited Annotations},
  year = {2021},
  abstract = {Non-intrusive speech quality assessment is a crucial operation in multimedia applications. The scarcity of annotated data and the lack of a reference signal represent some of the main challenges for designing efficient quality assessment metrics. In this paper, we propose two multi-task models to tackle the problems above. In the first model, we first learn a feature representation with a degradation classifier on a large dataset. Then we perform MOS prediction and degradation classification simultaneously on a small dataset annotated with MOS. In the second approach, the initial stage consists of learning features with a deep clustering-based unsupervised feature representation on the large dataset. Next, we perform MOS prediction and cluster label classification simultaneously on a small dataset. The results show that the deep clustering-based model outperforms the degradation classifier-based model and  the 3 baselines (autoencoder features, P.563, and SRMRnorm) on TCD-VoIP. This paper indicates that multi-task learning combined with feature representations from unlabelled data is a promising approach to deal with the lack of large MOS annotated datasets.},
  doi = {10.1109/QoMEX51781.2021.9465410},
  startyear = {2021},
  startmonth = {Jun},
  startday = {14},
  finishyear = {2021},
  finishmonth = {Jun},
  finishday = {17},
  conference = {13th International Conference on Quality of Multimedia Experience (QoMEX)},
  day = {14},
  publicationstatus = {published}
}
@inproceedings{singh2021prototypicalclassification,
  author = {Singh, S and Bear, H and Benetos, E},
  booktitle = {},
  month = {Jun},
  organization = {Toronto, Canada},
  publisher = {IEEE},
  title = {Prototypical Networks for Domain Adaptation in Acoustic Scene Classification},
  url = {http://www.eecs.qmul.ac.uk/profiles/singhshubhr.html},
  url = {https://2021.ieeeicassp.org/},
  year = {2021},
  abstract = {Acoustic Scene Classification (ASC) refers to the task of assigning a semantic label to an audio stream that characterizes the environment in which it was recorded. In recent times, Deep Neural Networks (DNNs) have emerged as the model of choice for ASC. However, in real world scenarios, domain adaptation remains a persistent problem for ASC models. In the search for an optimal solution to the said problem, we explore a metric learning approach called prototypical networks using the TUT Urban Acoustic Scenes dataset, which consists of 10 different acoustic scenes recorded across 10 cities. In order to replicate the domain adaptation scenario, we divide the dataset into source domain data consisting of data samples from eight randomly selected cities and target domain data consisting of data from the remaining two cities. We evaluate the performance of the network against a selected baseline network under various experimental scenarios and based on the results we conclude that metric learning is a promising approach towards addressing the  domain adaptation problem in ASC.},
  doi = {10.1109/ICASSP39728.2021.9414876},
  startyear = {2021},
  startmonth = {Jun},
  startday = {6},
  finishyear = {2021},
  finishmonth = {Jun},
  finishday = {11},
  keyword = {metric learning},
  keyword = {domain adaptation},
  keyword = {acoustic scene classification},
  keyword = {episodic training},
  conference = {IEEE International Conference on Acoustics, Speech and Signal Processing},
  day = {6},
  publicationstatus = {published}
}
@inproceedings{liu2021jointmusic,
  author = {Liu, L and Morfi, G-V and Benetos, E},
  booktitle = {},
  month = {Jun},
  organization = {Toronto, Canada},
  publisher = {IEEE},
  title = {Joint multi-pitch detection and score transcription for polyphonic piano music},
  url = {https://cheriell.github.io/},
  url = {https://2021.ieeeicassp.org/},
  year = {2021},
  abstract = {Research on automatic music transcription has largely focused on multi-pitch detection; there is limited discussion on how to obtain a machine- or human-readable score transcription. In this paper, we propose a method for joint multi-pitch detection and score transcription for polyphonic piano music. The outputs of our system include both a piano-roll representation (a descriptive transcription) and a symbolic musical notation (a prescriptive transcription). Unlike traditional methods that further convert MIDI transcriptions into musical scores, we use a multitask model combined with a Convolutional Recurrent Neural Network and Sequence-to-sequence models with attention mechanisms. We propose a Reshaped score representation that outperforms a LilyPond representation in terms of both prediction accuracy and time/memory resources, and compare different input audio spectrograms. We also create a new synthesized dataset for score transcription research. Experimental results show that the joint model outperforms a single-task model in score transcription.},
  doi = {10.1109/ICASSP39728.2021.9413601},
  startyear = {2021},
  startmonth = {Jun},
  startday = {6},
  finishyear = {2021},
  finishmonth = {Jun},
  finishday = {11},
  keyword = {automatic music transcription},
  keyword = {sequence-to-sequence models},
  keyword = {score transcription},
  conference = {IEEE International Conference on Acoustics, Speech and Signal Processing},
  day = {6},
  publicationstatus = {published}
}
@inproceedings{subramanian2021anomalousmethods,
  author = {Subramanian, V and Gururani, S and Benetos, E and Sandler, M},
  booktitle = {},
  month = {May},
  title = {Anomalous behaviour in loss-gradient based interpretability methods},
  year = {2021},
  conference = {RobustML Workshop at ICLR 2021},
  day = {7},
  publicationstatus = {accepted}
}
@article{viannalordelo2021adversarialseparation,
  author = {Vianna Lordelo, C and Benetos, E and Dixon, S and Ahlbäck, S and Ohlsson, P},
  journal = {IEEE Signal Processing Letters},
  month = {Jan},
  pages = {81--85},
  publisher = {Institute of Electrical and Electronics Engineers},
  title = {Adversarial Unsupervised Domain Adaptation for Harmonic-Percussive Source Separation},
  volume = {28},
  year = {2021},
  abstract = {This paper addresses the problem of domain adaptation for the task of music source separation. Using datasets from two different domains, we compare the performance of a deep learning-based harmonic-percussive source separation model under different training scenarios, including supervised joint training using data from both domains and pre-training in one domain with fine-tuning in another. We propose an adversarial unsupervised domain adaptation approach suitable for the case where no labelled data (ground-truth source signals) from a target domain is available. By leveraging unlabelled data (only mixtures) from this domain, experiments show that our framework can improve separation performance on the new domain without losing any considerable performance on the original domain. The paper also introduces the Tap \& Fiddle dataset, a dataset containing recordings of Scandinavian fiddle tunes along with isolated tracks for "foot-tapping" and "violin".},
  doi = {10.1109/LSP.2020.3045915},
  issn = {1070-9908},
  day = {1},
  publicationstatus = {published}
}
@article{holzapfelhumanitiestranscription,
  author = {Holzapfel, A and Benetos, E and Killick, A and Widdess, R},
  journal = {Digital Scholarship in the Humanities},
  publisher = {Oxford University Press (OUP)},
  title = {Humanities and Engineering Perspectives on Music Transcription},
  url = {https://academic.oup.com/dsh},
  year = {},
  abstract = {Music transcription is a process of creating a notation of musical sounds. It has been used as a basis for the analysis of music from a wide variety of cultures. Recent decades have seen an increasing amount of engineering research within the field of Music Information Retrieval (MIR) that aims at automatically obtaining music transcriptions in Western staff notation. However, such approaches are not widely applied in research in ethnomusicology. This paper aims to bridge interdisciplinary gaps by identifying aspects of proximity and divergence between the two fields. As part of our study, we collected manual transcriptions of traditional dance tune recordings by 18 transcribers. Our method employs a combination of expert and computational evaluation of these transcriptions. This enables us to investigate the limitations of automatic music transcription (AMT) methods and computational transcription metrics that have been proposed for their evaluation. Based on these findings, we discuss promising avenues to make AMT more useful for studies in the Humanities. These are, first, assessing the quality of a transcription based on an analytic purpose; second, developing AMT approaches that are able to learn conventions concerning the transcription of a specific style; third, a focus on novice transcribers as users of AMT systems; and, finally, considering target notation systems different from Western staff notation.},
  issn = {2055-7671},
  keyword = {music transcription},
  keyword = {ethnomusicology},
  keyword = {music information retrieval},
  keyword = {music notation},
  publicationstatus = {accepted}
}
@article{robsononpractitioners,
  author = {Robson, N and Bryan-Kinns, N and Mcpherson, A},
  journal = {Organised Sound: an international journal of music and technology},
  number = {1},
  publisher = {Cambridge University Press (CUP)},
  title = {On mediating space, sound and experience: interviews with situated sound art practitioners},
  volume = {28},
  year = {},
  abstract = {This article reports on an interview-based study with ten sound artists and composers, all engaged in situated sonic practices. We propose that these artists engage the ear and shape possible interactions with the artwork by altering the relationship between sound, the space in which it is heard and the people who hear it. Our interviews probe the creative process and explore how a sound artist’s methods and tools might influence the reception of their work. A thematic analysis of interview transcriptions leads us to characterise artist processes as mediatory, in the sense that they act in-between site and audience experience and are guided by the nonhuman agencies of settings and material things. We propose that artists transfer their own situated and embodied listening to that of the audience and develop sonic and staging devices to direct perceptual activity and listening attention. Our findings also highlight a number of engagement challenges, in particular the difficulty artists face in understanding their audience’s experience and the specificity of an artwork’s effect to not just its location, but to the disposition, abilities and prior experiences of listeners.},
  issn = {1355-7718},
  publicationstatus = {accepted}
}
@article{jing2021theuse,
  author = {Jing, C and Bryan-Kinns, N and Yang, S and Zhi, J and Zhang, J},
  journal = {International Journal of Industrial Ergonomics},
  month = {Jul},
  title = {The influence of mobile phone location and screen orientation on driving safety and the usability of car-sharing software in-car use},
  volume = {84},
  year = {2021},
  abstract = {One of the important parts of the Internet economy is the car-sharing economy. Nevertheless, few studies have been dedicated to the issues of safety of driving and usability resulting from the car-sharing software application. The current study aims at analyzing the effect of phone location and screen orientation on usability and safety of driving in Chinese drivers who use car-sharing software (e.g., UBER) and comparing the differences in the safety of driving and usability among various car-sharing tasks. To this end, 24 experienced Chinese Uber drivers were employed using a driving simulator, and three tasks of car-sharing software were investigated. The variables of driving safety and usability were analyzed by Repeated Measures ANOVA as well as Two-way ANOVA. It was found that when the phone is located on the left side of the steering wheel, usability and driving safety were better compared to locating it on the right side. The orientation of the mobile phone screen had a major impact on usability, while its impact on driving safety was trivial. The left-portrait mode showed the best performance. No significant impact on usability was found in terms of tasks (T1: ordering; T2: route checking; T3: destination search), while tasks showed a significant effect on driving safety. Among these tasks, the T3 task showed the highest impact on driving safety. The T2 task was in the second rank, followed by the T1 task (T1 < T2 < T3). The findings obtained from Chinese participants were different from the findings obtained in Japan and the United States. Relevance to industry: This finding can be used as a basis for the optimization design of car-sharing software and as a foundation for improving the safety and usability of car-sharing software used by drivers in the car.},
  doi = {10.1016/j.ergon.2021.103168},
  issn = {0169-8141},
  eissn = {1872-8219},
  day = {1},
  publicationstatus = {published}
}
@incollection{fang2021usingdisease,
  author = {Fang, Y and Ou, J and Bryan-Kinns, N and Kang, Q and Zhang, J and Guo, B},
  booktitle = {},
  month = {Jan},
  pages = {353--361},
  title = {Using Vibrotactile Device in Music Therapy to Support Wellbeing for People with Alzheimer’s Disease},
  volume = {261},
  year = {2021},
  abstract = {Music therapy is regarded as a non-pharmacological treatment to improve cognitive, psychological and behavioural alterations in patients with Alzheimer’s disease (AD). However, the general hearing loss in AD patients decreases their musical perception and the efficacy of music therapy. This paper presents a wearable vibrotactile prototype, a multisensory system that translates music into vibrotactile stimuli. We conducted an exploratory study with 8 older adults with Alzheimer’s disease to evaluate whether the prototype can improve the therapeutic effects of music therapy. Qualitative and quantitative analysis were utilized to explore different aspects of the prototype, and results showed that combining music therapy and vibrotactile stimuli leads to obvious improvement in cognitive scores. Results also demonstrated the contribution of vibrotactile stimuli in promoting positive emotions and rhythmic behaviors in elderly patients with Alzheimer’s disease, and the fingertips are suitable body parts for wearable vibration devices.},
  doi = {10.1007/978-3-030-79760-7_43},
  isbn = {9783030797591},
  issn = {2367-3370},
  eissn = {2367-3389},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{soave2021exploringreality,
  author = {Soave, F and Padma Kumar, A and Bryan-Kinns, N and Farkhatdinov, I},
  booktitle = {DIS 2021 - Proceedings of the 2021 ACM Designing Interactive Systems Conference: Nowhere and Everywhere},
  month = {Jun},
  pages = {171--179},
  title = {Exploring Terminology for Perception of Motion in Virtual Reality},
  year = {2021},
  abstract = {A key aspect of Virtual Reality (VR) applications is the ability to move in the environment, which relies on the illusion of self-motion to create a good user experience. Self-motion has traditionally been studied in psychophysical studies in which a range of wording has been adopted to describe self-motion. However, it is not clear from current research whether the words used in self-motion studies match study participants' own intuitions about the experience of self-motion. We argue that the terminology used in self-motion studies should be drawn from a participant perspective to improve validity. We undertook an online study involving VR self-motion and card-sorting with 50 participants to examine current self-motion terminology. We found that participants were not familiar with the concept of self-motion and that the virtual scene itself might suggest different terminology. We suggest how studies on motion perception in VR should be designed to better reflect participants' vernacular.},
  doi = {10.1145/3461778.3462064},
  isbn = {9781450384766},
  day = {28},
  publicationstatus = {published}
}
@inproceedings{ford2021creativitycomposition,
  author = {Ford, C and Bryan-Kinns, N and Nash, C},
  booktitle = {},
  editor = {Dannenberg, R and Xiao, X},
  month = {Jun},
  organization = {NYU Shanghai, Shanghai.},
  url = {https://nime.pubpub.org/pub/ker5w948/release/1},
  title = {Creativity in Children's Digital Music Composition},
  year = {2021},
  abstract = {Composing is a neglected area of music education. To increase participation, many technologies provide open-ended interfaces to motivate child autodidactic use, drawing influence from Papert’s LOGO philosophy to support children’s learning through play. This paper presents a case study examining which interactions with Codetta, a LOGO-inspired, block-based music platform, supports children’s creativity in music composition. Interaction logs were collected from 20 children and correlated against socially-validated creativity scores. To conclude, we recommend that the transition between low-level edits and high-level processes should be carefully scaffolded.},
  startyear = {2021},
  startmonth = {Jun},
  startday = {14},
  finishyear = {2021},
  finishmonth = {Jun},
  finishday = {18},
  keyword = {empirical studies},
  keyword = {creativity},
  keyword = {consensual assessment},
  keyword = {music composition},
  keyword = {music education},
  keyword = {block-based programming},
  keyword = {interaction data},
  keyword = {child-computer interaction},
  conference = {New Interfaces for Musical Expression},
  day = {14},
  publicationstatus = {accepted}
}
@article{ratclife2021extendedopportunities,
  author = {Ratcliffe, J and Soave, F and Bryan-Kinns, N and Tokarchuk, L and Farkhatdinov, I},
  journal = {Conference on Human Factors in Computing Systems - Proceedings},
  month = {May},
  title = {Extended reality (xr) remote research: A survey of drawbacks and opportunities},
  year = {2021},
  abstract = {Extended Reality (XR) technology - such as virtual and augmented reality - is now widely used in Human Computer Interaction (HCI), social science and psychology experimentation. However, these experiments are predominantly deployed in-lab with a co-present researcher. Remote experiments, without co-present researchers, have not flourished, despite the success of remote approaches for non-XR investigations. This paper summarises findings from a 30-item survey of 46 XR researchers to understand perceived limitations and benefits of remote XR experimentation. Our thematic analysis identifies concerns common with non-XR remote research, such as participant recruitment, as well as XR-specific issues, including safety and hardware variability. We identify potential positive affordances of XR technology, including leveraging data collection functionalities built-in to HMDs (e.g. hand, gaze tracking) and the portability and reproducibility of an experimental setting. We suggest that XR technology could be conceptualised as an interactive technology and a capable data-collection device suited for remote experimentation.},
  doi = {10.1145/3411764.3445170},
  day = {6},
  publicationstatus = {published}
}
@article{nonnis2021ollytogetherness,
  author = {Nonnis, A and Bryan-Kinns, N},
  journal = {International Journal of Human Computer Studies},
  month = {Apr},
  title = {Olly: A tangible for togetherness},
  url = {https://doi.org/10.1016/j.ijhcs.2021.102647},
  volume = {153},
  year = {2021},
  abstract = {This research explores how tangible interactive technology might offer opportunities for socialization and sensory regulation. We present a study carried out in an educational setting during leisure activities with a small group of children with autism who like music. We introduce Όλοι (pronounced Olly), a sonic textile Tangible User Interface (TUI) designed around the observations of five minimally verbal children with autism aged between 5-10 years. The TUI was tested for an average of 24 minutes once per week, over a period of five weeks in a specialized school based in North-East London, UK. We propose a methodological approach that embraces diversity and promotes designs that support repetitive movements and self-regulation to provide the children with a favorable environment and tools to socialize with peers. The findings show positive outcomes with regards to spontaneous social interactions between peers particularly when children interacted with or around Olly. These were observed in the form of eye-contact, turn-taking, sharing (of the space, the object and experience), and more complex social play dynamics like associative and cooperative play. We illustrate how the TUI was a positive stimulus of social behaviors and discuss design implications for novel technologies that aim to foster shared experiences between children with autism.},
  doi = {10.1016/j.ijhcs.2021.102647},
  issn = {1071-5819},
  eissn = {1095-9300},
  day = {6},
  publicationstatus = {accepted}
}
@incollection{daniele2021whatcreativity,
  author = {Daniele, A and Di Bernardi Luft, C and Bryan-Kinns, N},
  booktitle = {},
  month = {Jan},
  pages = {396--411},
  title = {“What Is Human?” A Turing Test for Artistic Creativity},
  volume = {12693 LNCS},
  year = {2021},
  abstract = {This paper presents a study conducted in naturalistic setting with data collected from an interactive art installation. The audience is challenged in a Turing Test for artistic creativity involving recognising human-made versus AI-generated drawing strokes. In most cases, people were able to differentiate human-made strokes above chance. An analysis conducted on the images at the pixel level shows a significant difference between the symmetry of the AI-generated strokes and the human-made ones. However we argue that this feature alone was not key for the differentiation. Further behavioural analysis indicates that people judging more quickly were able to differentiate human-made strokes significantly better than the slower ones. We point to theories of embodiment as a possible explanation of our results.},
  doi = {10.1007/978-3-030-72914-1_26},
  isbn = {9783030729134},
  issn = {0302-9743},
  eissn = {1611-3349},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{ratcliffe2021remoteexperimentation,
  author = {Ratcliffe, J and Soave, F and Hoover, M and Ortega, FR and Bryan-Kinns, N and Tokarchuk, L and Farkhatdinov, I},
  booktitle = {CHI Extended Abstracts},
  editor = {Kitamura, Y and Quigley, A and Isbister, K and Igarashi, T},
  pages = {121:1--121:1},
  publisher = {ACM},
  title = {Remote XR Studies: Exploring Three Key Challenges of Remote XR Experimentation.},
  url = {https://doi.org/10.1145/3411763},
  year = {2021},
  isbn = {978-1-4503-8095-9}
}
@inproceedings{soave2021multisensoryapplications,
  author = {Soave, F and Farkhatdinov, I and Bryan-Kinns, N},
  booktitle = {Proceedings - 2021 IEEE Conference on Virtual Reality and 3D User Interfaces Abstracts and Workshops, VRW 2021},
  month = {Mar},
  pages = {377--379},
  title = {Multisensory teleportation in virtual reality applications},
  year = {2021},
  abstract = {This position paper aims to briefly summarise existing research in vection, teleportation and multisensory stimuli, present our cross-disciplinary research setup and argue towards the importance of discussing haptic feedback design in Virtual Reality (VR) locomotion techniques. In particular, haptic feedback stimulation has been shown to enhance the perception of self-motion when applied to various parts of the body. The recent developments of haptic devices open the possibilities to explore "whole-body" haptics in virtual environments for locomotion techniques. We argue that crossmodal stimulation frameworks that have been already applied to study self-motion in VR could potentially provide benefits to locomotion studies.},
  doi = {10.1109/VRW52623.2021.00077},
  isbn = {9780738113678},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{liang2021knitsensing,
  author = {Liang, A and Stewart, R and Freire, R and Bryan-Kinns, N},
  booktitle = {TEI 2021 - Proceedings of the 15th International Conference on Tangible, Embedded, and Embodied Interaction},
  month = {Feb},
  title = {Knit Stretch Sensor Placement for Body Movement Sensing},
  year = {2021},
  abstract = {Motion capture technology is widely used in movement-related Human-Computer Interaction, especially in digital arts such as digital dance performance. This paper presents a knit stretch sensor-based dance leotard design to evaluate the locations where the sensors best capture the movement on the body. Two studies are undertaken: (1) interviews to determine user requirements of a dance movement sensing system; (2) evaluation of sensor placement on the body. Ten interviewees including dancers, choreographers, and technologists describe their requirements and expectations for a body movement sensing system. The centre of the body (the torso) is determined to be the area of primary interest for dancers and choreographers to sense movement, and technologists find the robustness of textile sensors the most challenging for textile sensing system design. A dance leotard toile is then designed with sensor groupings on the torso along the direction of major muscles, based on the interviewees' preferred movements to be captured. Each group of sensors is evaluated by comparing its signal output with a Vicon motion capture system. The evaluation shows sensors which are constantly under tension perform better. For example, sensors on the upper back have a higher success rate than the sensors on the lower back. The dance leotard design was found to capture the movements of standing lean back and standing waist twists the best.},
  doi = {10.1145/3430524.3440629},
  isbn = {9781450382137},
  day = {14},
  publicationstatus = {published}
}
@inproceedings{ratcliffe2021extendedopportunities,
  author = {Ratcliffe, J and Soave, F and Bryan-Kinns, N and Tokarchuk, L and Farkhatdinov, I},
  booktitle = {CHI},
  editor = {Kitamura, Y and Quigley, A and Isbister, K and Igarashi, T and Bjørn, P and Drucker, SM},
  pages = {527:1--527:1},
  publisher = {ACM},
  title = {Extended Reality (XR) Remote Research: a Survey of Drawbacks and Opportunities.},
  url = {https://doi.org/10.1145/3411764},
  year = {2021},
  isbn = {978-1-4503-8096-6}
}
@inproceedings{benitotemprano2021aguitar,
  author = {Benito Temprano, A and Mcpherson, AP},
  booktitle = {},
  month = {Sep},
  organization = {University of Trento (Italy) [Online]},
  title = {A TMR Angle Sensor for Gesture Acquisition and Disambiguation on the Electric Guitar},
  year = {2021},
  abstract = {This paper presents a novel approach to the acquisition of musical gestures on guitar based on Tunneling Magnetoresistance (TMR) sensing. With this minimally invasive setup, tracking of the horizontal displacement of the strings is used to capture gestures related to left and right-hand techniques. A pitch-based calibration is suggested to map the sensed displacement to pitch shifts so that the acquired signals can be directly used to estimate pitch produced by string bending in real-time. Some of the performer’s gestures, despite corresponding to different physical interactions, might produce a similar sonic output, as is the case of upward and downward string bends on the guitar. The proposed technology can be used to disambiguate between these gestures whether that is for automatic transcription purposes or for crafting instrument augmentations that build upon the performer’s existing expertise.},
  startyear = {2021},
  startmonth = {Sep},
  startday = {1},
  finishyear = {2021},
  finishmonth = {Sep},
  finishday = {3},
  conference = {Audio Mostly 2021 (AM'21). Sonic experiences in the era of the Internet of Sounds},
  day = {1},
  publicationstatus = {published}
}
@article{moro2021performerinstrument,
  author = {Moro, G and McPherson, AP},
  journal = {Computer Music Journal},
  month = {Jul},
  number = {2-3},
  pages = {69--91},
  title = {Performer experience on a continuous keyboard instrument},
  volume = {44},
  year = {2021},
  abstract = {On several keyboard instruments the produced sound is not always dependent exclusively on a discrete key-velocity parameter, and minute gestural details can affect the final sonic result. By contrast, variations in articulation beyond velocity have normally no effect on the produced sound when the keyboard controller uses the MIDI standard, used in the vast majority of digital keyboards. In this article, we introduce a novel keyboard-based digital musical instrument that uses continuous readings of key position to control a nonlinear waveguide flute synthesizer with a richer set of interaction gestures than would be possible with a velocity-based keyboard. We then report on the experience of six players interacting with our instrument and reflect on their experience, highlighting the opportunities and challenges that come with continuous key sensing.},
  doi = {10.1162/COMJ_a_00565},
  issn = {0148-9267},
  eissn = {1531-5169},
  day = {27},
  publicationstatus = {published}
}
@inproceedings{reed2021surfacevocalists,
  author = {Reed, CN and McPherson, AP},
  booktitle = {TEI 2021 - Proceedings of the 15th International Conference on Tangible, Embedded, and Embodied Interaction},
  month = {Feb},
  title = {Surface Electromyography for Sensing Performance Intention and Musical Imagery in Vocalists},
  year = {2021},
  abstract = {Through experience, the techniques used by professional vocalists become highly ingrained and much of the fine muscular control needed for healthy singing is executed using well-refined mental imagery. In this paper, we provide a method for observing intention and embodied practice using surface electromyography (sEMG) to detect muscular activation, in particular with the laryngeal muscles. Through sensing the electrical neural impulses causing muscular contraction, sEMG provides a unique measurement of user intention, where other sensors reflect the results of movement. In this way, we are able to measure movement in preparation, vocalised singing, and in the use of imagery during mental rehearsal where no sound is produced. We present a circuit developed for use with the low voltage activations of the laryngeal muscles; in sonification of these activations, we further provide feedback for vocalists to investigate and experiment with their own intuitive movements and intentions for creative vocal practice.},
  doi = {10.1145/3430524.3440641},
  isbn = {9781450382137},
  day = {14},
  publicationstatus = {published}
}
@inproceedings{agrawal2021structureawarenetworks,
  author = {Agrawal, R and Wolff, D and Dixon, S},
  booktitle = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  month = {Jun},
  publisher = {IEEE},
  title = {Structure-Aware Audio-to-Score Alignment Using Progressively Dilated Convolutional Neural Networks},
  year = {2021},
  doi = {10.1109/icassp39728.2021.9414049},
  startyear = {2021},
  startmonth = {Jun},
  startday = {6},
  finishyear = {2021},
  finishmonth = {Jun},
  finishday = {11},
  conference = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  day = {6},
  publicationstatus = {published}
}
@inproceedings{demirel2021lowrecordings,
  author = {Demirel, E and Ahlbäck, S and Dixon, S},
  booktitle = {},
  month = {Jun},
  pages = {586--590},
  title = {Low Resource Audio-To-Lyrics Alignment from Polyphonic Music Recordings},
  volume = {00},
  year = {2021},
  doi = {10.1109/icassp39728.2021.9414395},
  conference = {IEEE International Conference on Acoustics, Speech and Signal Processing},
  day = {11}
}
@inproceedings{zhang2021cosmiccocreation,
  author = {Zhang, Y and Xia, G and Levy, M and Dixon, S},
  booktitle = {},
  month = {Apr},
  title = {COSMIC: A Conversational Interface for Human-AI Music Co-Creation},
  year = {2021},
  abstract = {In this paper, we propose COSMIC, a COnverSational Interface for Human-AI MusIc Co-Creation. It is a chatbot with a two-fold design philosophy: to understand human creative intent and to help humans in their creation. The core Natural Language Processing (NLP) module is responsible for three functions: 1) understanding human needs in chat, 2) cross-modal interaction between natural language understanding and music generation models, and 3) mixing and coordinating multiple algorithms to complete the composition.},
  startyear = {2021},
  startmonth = {Jun},
  startday = {14},
  finishyear = {2021},
  finishmonth = {Jun},
  finishday = {18},
  conference = {New Interfaces for Musical Expression},
  day = {29},
  publicationstatus = {published}
}
@inproceedings{miller2021discoveringcollections,
  author = {Miller, J and Nicosia, V and Sandler, M},
  booktitle = {ACM International Conference Proceeding Series},
  month = {Jul},
  pages = {93--97},
  title = {Discovering Common Practice: Using Graph Theory to Compare Harmonic Sequences in Musical Audio Collections},
  year = {2021},
  abstract = {In recent decades, rapid technological advances have resulted in a huge quantity of readily accessible digital musical recordings. The scope of large corpora presents difficulties for curators but offers new opportunities to musicologists and music theorists. We propose an application of graph theory which enables comparison of harmonic content from musical audio across collections of recordings. We introduce a graph schema wherein the chord sequences of musical recordings are used to create directed, weighted graphs which represent the underlying harmonic structure of the source material. We believe this application of graph theory offers novel advantages over existing approaches: 1) the relative positions of the chords in the time domain are retained, allowing the graphs to represent entire harmonic sequences of musical material, and 2) sequences from multiple sources are combined into a single graph, exposing features which are common to the source musical material, but which may vary or be absent from any particular instance. To test the schema, graphs were generated from recordings of `Georgia on My Mind'. We were able to produce examples demonstrating how this schema could be used to identify the essential harmonic framework of the song, to gain insight regarding the usage of chord substitutions by an artist during a single performance of the song, and to compare chord choices by two artists representing two different genres.},
  doi = {10.1145/3469013.3469025},
  isbn = {9781450384292},
  day = {28},
  publicationstatus = {published}
}
@article{kirby2021thetransform,
  author = {Kirby, T and Sandler, M},
  journal = {J Acoust Soc Am},
  month = {Jul},
  number = {1},
  organization = {United States},
  pages = {202},
  title = {The evolution of drum modes with strike intensity: Analysis and synthesis using the discrete cosine transform.},
  url = {https://www.ncbi.nlm.nih.gov/pubmed/34340487},
  volume = {150},
  year = {2021},
  abstract = {The synthesis of convincing acoustic drum sounds remains an open problem. In this paper, a method for analysing and synthesising pitch glide in drums is proposed, whereby the discrete cosine transform (DCT) of an unwindowed drum sound is modelled. This is an extension of the scheme initially proposed by Kirby and Sandler [(2020). Proceedings of the 23rd International Conference on Digital Audio Effects, Vienna, Austria, pp. 155-162], which was able to reproduce key components of drum sounds accurately enough that they could not be distinguished from the reference samples. Here, drum modes were analysed in greater detail for a tom-tom struck at 67 different intensities to investigate their evolution with strike velocity. A clear evolution was observed in the DCT features, and interpolation was used to synthesise the modes of intermediate velocity. These synthesised modes were evaluated objectively through null testing, which showed that a continuous blending of strike velocities could be achieved throughout the data set. An AB listening test was also performed, where 20 participants attempted to distinguish between pairs of real and synthesised sounds. Exactly 50\% accuracy was achieved overall, which demonstrates that the synthesised samples were deemed to sound as realistic as genuine samples. These results demonstrate that the DCT representation is a valuable framework for analysis and synthesis of drum sounds. It is also likely that this approach could be applied to other instruments.},
  doi = {10.1121/10.0005509},
  eissn = {1520-8524},
  keyword = {Acoustics},
  keyword = {Auditory Perception},
  keyword = {Humans},
  keyword = {Sound},
  language = {eng},
  publicationstatus = {published}
}
@inproceedings{shukla2021userreality,
  author = {Shukla, R and Stewart, R and Sandler, M},
  booktitle = {},
  url = {https://zenodo.org/record/5054146},
  month = {Jun},
  organization = {Online},
  title = {User HRTF Selection for 3D Auditory Mixed Reality},
  year = {2021},
  abstract = {We introduce a novel approach for personalisation of an efficient 3D binaural rendering system designed for mobile, auditory mixed reality use cases. A head-related transfer function (HRTF) ranking method is outlined for users of real-time, interactive sound and music applications. Twenty participants tested the approach and its impact on their capacity to locate a continuous musical sound rendered in varying 3D positions. Analysis of HRTF rankings across three separate sessions reveal encouraging levels of reliability amongst some participants. Patterns of interaction show a significant benefit to horizontal precision that results from the selection process. In contrast, length of system exposure (rather than HRTF preference) demonstrates a significant degree of improvement to aspects of vertical perception and overall speed of response, with no detriment to horizontal accuracy. These findings provide an initial basis from which to consider priorities in the design of audio-only immersive applications and accompanying methods for effective user controlled personalisation.},
  doi = {10.5281/zenodo.5045168},
  startyear = {2021},
  startmonth = {Jun},
  startday = {29},
  finishyear = {2021},
  finishmonth = {Jul},
  finishday = {1},
  keyword = {audio augmented reality},
  keyword = {HRTF selection},
  keyword = {binaural rendering},
  keyword = {spatial sonic interaction},
  conference = {Sound and Music Computing Conference},
  day = {30},
  publicationstatus = {published}
}
@article{ohanlon2021fifthnetrecognition,
  author = {O'Hanlon, K and Sandler, M},
  journal = {IEEE/ACM Transactions on Audio Speech and Language Processing},
  month = {Jan},
  pages = {2671--2682},
  title = {FifthNet: Structured Compact Neural Networks for Automatic Chord Recognition},
  volume = {29},
  year = {2021},
  abstract = {Deep learning has become popular for many music processing tasks with Convolutional Neural Networks (CNNs) often applied. CNNs can be computationally expensive, a problem that may be alleviated through design of compact network elements or by compressing trained networks. CNNs assemble high-level structure in a hierarchical fashion, starting from small simple local patterns. On the other hand, much structure found in music spectra, such as harmonicity, is already well-defined. Both signal representations and processing methods have previously exploited such structure. We propose FifthNet, a compact neural network that is applied to the task of Automatic Chord Recognition (ACR). The compactness of FifthNet is effected through exploiting known data structure; first by arranging the network inputs according to expected data structures, then by separating processing of the semantically meaningful dimensions of the data. FifthNet is then seen to perform similar to a state-of-the-art CNN for ACR while employing only a small percentage of the parameters and computational expense used by the CNN.},
  doi = {10.1109/TASLP.2021.3070158},
  issn = {2329-9290},
  eissn = {2329-9304},
  day = {1},
  publicationstatus = {published}
}
@article{colonel2021reverseprocessing,
  author = {Colonel, JT and Reiss, J},
  journal = {Journal of the Acoustical Society of America},
  month = {Jul},
  number = {1},
  pages = {608--619},
  title = {Reverse engineering of a recording mix with differentiable digital signal processing},
  volume = {150},
  year = {2021},
  abstract = {A method to retrieve the parameters used to create a multitrack mix using only raw tracks and the stereo mixdown is presented. This method is able to model linear time-invariant effects such as gain, pan, equalisation, delay, and reverb. Nonlinear effects, such as distortion and compression, are not considered in this work. The optimization procedure used is the stochastic gradient descent with the aid of differentiable digital signal processing modules. This method allows for a fully interpretable representation of the mixing signal chain by explicitly modelling the audio effects rather than using differentiable blackbox modules. Two reverb module architectures are proposed, a “stereo reverb” model and an “individual reverb” model, and each is discussed. Objective feature measures are taken of the outputs of the two architectures when tasked with estimating a target mix and compared against a stereo gain mix baseline. A listening study is performed to measure how closely the two architectures can perceptually match a reference mix when compared to a stereo gain mix. Results show that the stereo reverb model performs best on objective measures and there is no statistically significant difference between the participants' perception of the stereo reverb model and reference mixes.},
  doi = {10.1121/10.0005622},
  issn = {0001-4966},
  eissn = {1520-8524},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{steinmetz2021pyloudnormpython,
  author = {Steinmetz, CJ and Reiss, JD},
  booktitle = {150th Audio Engineering Society Convention, AES 2021},
  month = {Jan},
  title = {Pyloudnorm: A simple yet flexible loudness meter in python},
  year = {2021},
  abstract = {The ITU-R BS.1770 recommendation for measuring the perceived loudness of audio signals has seen widespread adoption in broadcasting. Due to its simplicity, this algorithm has now found applications across audio signal processing. Here we describe pyloudnorm, a Python package that enables the measurement of integrated loudness following the recommendation. While a number of implementations are available, ours provides an easy-to-install package, a simple interface, and the ability to adjust the algorithm parameters, a feature that others neglect. We outline the design of pyloudnorm and discuss a set of modifications based upon recent literature that improve the robustness of loudness measurements. We perform an evaluation comparing accuracy and runtime with six other implementations, demonstrating that pyloudnorm is both fully compliant and one of the fastest options.},
  isbn = {9781713830672},
  day = {1},
  publicationstatus = {published}
}
@inproceedings{reiss2021atechniques,
  author = {Reiss, JD and Tez, HE and Selfridge, R},
  booktitle = {150th Audio Engineering Society Convention, AES 2021},
  month = {Jan},
  title = {A comparative perceptual evaluation of thunder synthesis techniques},
  year = {2021},
  abstract = {The sound of thunder is widely used in game, film and virtual reality sound design. It is also a phenomenon for which we seek a better understanding of the physics underlying the sound. Though many models of thunder have been proposed, there has not yet been a formal perceptual evaluation of the models to assess their realism and sound quality. Here, we present and evaluate the implementation of several thunder sound effect synthesis models. The models include different physical modeling and signal-based approaches, as well as a recorded sample. Evaluation was with over 50 participants. The results showed that none of the models were close to the recording in terms of realism, though signal-based models slightly outperformed the physical models. This highlights the need for comparative perceptual evaluation in sound synthesis, and identifies the limitations of current thunder simulation approaches.},
  isbn = {9781713830672},
  day = {1},
  publicationstatus = {published}
}

This file was generated by bibtex2html 1.96.