@inproceedings{cardinale2022neoriemannian, author = {Cardinale, S and Colton, S}, booktitle = {Proceedings of the International Conference on Computational Creativity}, title = {Neo-Riemannian Theory for Generative Film and Videogame Music}, month = {Jun}, year = {2022} }
@article{aziz2022planningoverviews, author = {Aziz, N and Stockman, T and Stewart, R}, journal = {ACM Transactions on Accessible Computing}, month = {Apr}, publisher = {Association for Computing Machinery (ACM)}, title = {Planning Your Journey in Audio: Design and Evaluation of Auditory Route Overviews}, year = {2022}, doi = {10.1145/3531529}, issn = {1936-7236}, day = {27}, publicationstatus = {published} }
@misc{preniqi2022morelyrics, author = {Preniqi, V and Kalimeri, K and Saitis, C}, month = {Sep}, title = {"More Than Words": Linking Music Preferences and Moral Values Through Lyrics}, year = {2022}, day = {2} }
@misc{hayes2022disembodiedsynthesis1, author = {Hayes, B and Saitis, C and Fazekas, G}, month = {Aug}, title = {Disembodied Timbres: a Study on Semantically Prompted FM Synthesis}, year = {2022}, doi = {10.31234/osf.io/ksw5j}, day = {5} }
@article{hayes2022disembodiedsynthesis2, author = {Hayes, B and Saitis, C and Fazekas, G}, journal = {Journal of the Audio Engineering Society}, month = {May}, number = {5}, pages = {373--391}, publisher = {Audio Engineering Society}, title = {Disembodied Timbres: A Study on Semantically Prompted FM Synthesis}, volume = {70}, year = {2022}, abstract = {Disembodied electronic sounds constitute a large part of the modern auditory lexicon, but research into timbre perception has focused mostly on the tones of conventional acoustic musical instruments. It is unclear whether insights from these studies generalize to electronic sounds, nor is it obvious how these relate to the creation of such sounds. This work presents an experiment on the semantic associations of sounds produced by FM synthesis with the aim of identifying whether existing models of timbre semantics are appropriate for such sounds. A novel experimental paradigm, in which experienced sound designers responded to semantic prompts by programming a synthesizer, was applied, and semantic ratings on the sounds they created were provided. Exploratory factor analysis revealed a five-dimensional semantic space. The first two factors mapped well to the concepts of luminance, texture, and mass. The remaining three factors did not have clear parallels, but correlation analysis with acoustic descriptors suggested an acoustical relationship to luminance and texture. The results suggest that further inquiry into the timbres of disembodied electronic sounds, their synthesis, and their semantic associations would be worthwhile and that this could benefit research into auditory perception and cognition and synthesis control and audio engineering.}, doi = {10.17743/jaes.2022.0006}, issn = {0004-7554}, day = {11}, publicationstatus = {published} }
@misc{delgado2022deepclassification, author = {Delgado, A and Demirel, E and Subramanian, V and Saitis, C and Sandler, M}, month = {Apr}, title = {Deep Embeddings for Robust User-Based Amateur Vocal Percussion Classification}, year = {2022}, doi = {10.48550/arxiv.2204.04646}, day = {10} }
@misc{delgado2022deepvocalisation, author = {Delgado, A and Saitis, C and Benetos, E and Sandler, M}, month = {Apr}, title = {Deep Conditional Representation Learning for Drum Sample Retrieval by Vocalisation}, year = {2022}, doi = {10.48550/arxiv.2204.04651}, keyword = {Clinical Research}, day = {10} }
@article{turchet2022theontology, author = {Turchet, L and Bouquet, P and Molinari, A and Fazekas, G}, journal = {Journal of Web Semantics}, month = {Apr}, title = {The Smart Musical Instruments Ontology}, volume = {72}, year = {2022}, abstract = {The Smart Musical Instruments (SMIs) are an emerging category of musical instruments that belong to the wider class of Musical Things within the Internet of Musical Things paradigm. SMIs encompass sensors, actuators, embedded intelligence, and wireless connectivity to local networks and to the Internet. Interoperability represents a key issue within this domain, where heterogeneous SMIs are envisioned to exchange information between each other and a plethora of Musical Things. This paper proposes an ontology for the representation of the knowledge related to SMIs, with the aim of facilitating interoperability between SMIs as well as with other Musical Things interacting with them. There was no previous comprehensive data model for the SMIs domain; however, the new ontology relates to existing ontologies, including the SOSA Ontology for the representation of sensors and actuators, the Audio Effects Ontology dealing with the description of digital audio effects, and the IoMusT Ontology for the representation of Musical Things and IoMusT ecosystems. This paper documents the design of the ontology and its evaluation with respect to specific requirements gathered from an extensive literature review, which was based on scenarios involving SMIs stakeholders, such as performers and studio producers. The SMI Ontology can be accessed at: https://w3id.org/smi#.}, doi = {10.1016/j.websem.2021.100687}, issn = {1570-8268}, day = {1}, publicationstatus = {published} }
@incollection{frachi2022designbiosignals, author = {Frachi, Y and Takahashi, T and Wang, F and Barthet, M}, conference = {}, month = {Jan}, pages = {160--179}, title = {Design of Emotion-Driven Game Interaction Using Biosignals}, volume = {13334 LNCS}, year = {2022}, abstract = {Video games can evoke a wide range of emotions in players through multiple modalities. However, on a broader scale, human emotions are probably an important missing part of the current generation of Human Computer Interaction (HCI). The main goal of this project is to start investigating how to design video games where the game mechanics and interactions are based on the player’s emotions. We designed a two-dimensional (2D) storytelling game prototype with Unity. Game designers and creators manage the user’s experience and emotions along the play through visual effects, sound effects, controls and narration. In particular for this project, we have chosen to create emotionally-driven interactions for two specific aspects: sound (audio effects, music), and narration (storytelling). Our prototype makes use of the Ovomind smart band and biosignals analysis technology developed by the first author. By wearing the smart band, human body physiological information are extracted and classified using signal processing method into groups of emotions mapped to the arousal \& valence (AV) plane. The 2D AV emotion representation is directly used as an interactive input into the game interaction system. Regarding music, we propose a system that automatically arranges background music by inputting emotions analysed by the smart band into an AI model. We evaluated the results using video recordings of the experience and collected feedback from a total of 30 participants. The results show that participants are favorable to narrative and music game adaptations based on real-time player emotion analysis. Some issues were also highlighted e.g. around the coherence of game progression. Participants also felt that the background music arrangements matched the player’s emotions well. Further experiments are required and planned to assess whether the prospects expressed by participants match their personal experience when playing the emotion-driven game.}, doi = {10.1007/978-3-031-05637-6_10}, isbn = {9783031056369}, issn = {0302-9743}, eissn = {1611-3349}, day = {1}, publicationstatus = {published} }
@article{wu2022exploringinterfaces, author = {Wu, Y and Bryan-Kinns, N and Zhi, J}, journal = {Journal on Multimodal User Interfaces}, month = {Sep}, number = {3}, pages = {343--356}, title = {Exploring visual stimuli as a support for novices’ creative engagement with digital musical interfaces}, volume = {16}, year = {2022}, abstract = {Visual materials are a widely used tool for stimulating creativity. This paper explores the potential for visual stimuli to support novices’ creative engagement with multimodal digital musical interfaces. An empirical study of 24 participants was conducted to compare the effect of abstract and literal forms of graphical scores on novices’ creative engagement, and whether being informed or uninformed about meanings of symbols in the score had any impact on creative engagement. The results suggest that abstract visual stimuli can provide an effective scaffold for creative engagement when participants are not informed about their design. It was found that providing information about visual stimuli has both advantages and disadvantages, depending largely on the stimuli’s visual style. Being informed about the meaning of a literal visual stimuli helped participants in making interpretations and gaining inspiration, whereas having information about abstract stimuli led to frustration. Qualitative data indicates that both forms of visual stimuli support creative engagement but at different stages of a creative process, and a descriptive model is presented to explain this. The findings highlight the benefits of visual stimuli in supporting creative engagement in the process of music making – a multimodal interaction domain typically involving few or no visual activities.}, doi = {10.1007/s12193-022-00393-3}, issn = {1783-7677}, eissn = {1783-8738}, day = {1}, publicationstatus = {published} }
@inproceedings{robson2022beingsensors, author = {Robson, N and McPherson, A and Bryan-Kinns, N}, conference = {}, month = {Jun}, title = {Being With The Waves: An Ultrasonic Art Installation Enabling Rich Interaction Without Sensors}, year = {2022}, doi = {10.21428/92fbeb44.376bc758}, booktitle = {NIME 2022}, day = {28} }
@inproceedings{ford2022identifyinghome, address = {New York, NY, USA}, author = {Ford, C and Bryan-Kinns, N}, conference = {Creativity and Cognition}, month = {Jun}, organization = {Venice, Italy}, pages = {443--456}, publisher = {ACM Digital Library}, title = {Identifying Engagement in Children's Interaction whilst Composing Digital Music at Home}, url = {https://dl.acm.org/doi/10.1145/3527927.3532794}, year = {2022}, abstract = {Identifying points of engagement from a person’s interaction with computers could be used to assess their experience and to adapt user interfaces in real-time. However, it is difficult to identify points of engagement unobtrusively; HCI studies typically use retrospective protocols or rely on cumbersome sensors for real-time analysis. We present a case study on how children compose digital music at home in which we remotely identify points of engagement from patterns of interaction with a musical interface. A mixed-methods approach is contributed in which video recordings of children’s interactions whilst composing are labelled for engagement and linked to i) interaction logs from the interface to identify indicators of engagement in interaction, and ii) interview data gathered using a remote video-cued recall technique to understand the experiential qualities of engaging interactions directly from users. We conclude by speculating on how the suggested indicators of engagement inform the design of adaptive music systems.}, doi = {10.1145/3527927.3532794}, startyear = {2022}, startmonth = {Jun}, startday = {20}, finishyear = {2022}, finishmonth = {Jun}, finishday = {23}, isbn = {9781450393270}, keyword = {adaptive systems}, keyword = {children}, keyword = {composition}, keyword = {creativity}, keyword = {creativity support tools}, keyword = {engagement}, keyword = {flow}, keyword = {music}, keyword = {novice}, keyword = {online}, keyword = {remote}, booktitle = {ACM Conference on Creativity \& Cognition}, day = {20}, publicationstatus = {online-published} }
@inproceedings{zheng2022squeezeprototypes, author = {Zheng, J and Bryan-Kinns, N}, conference = {}, month = {Jun}, title = {Squeeze, Twist, Stretch: Exploring Deformable Digital Musical Interfaces Design Through Non-Functional Prototypes}, year = {2022}, doi = {10.21428/92fbeb44.41da9da5}, booktitle = {NIME 2022}, day = {16}, publicationstatus = {published} }
@inproceedings{zheng2022materialdesign, author = {Zheng, J and Bryan-Kinns, N and McPherson, AP}, conference = {}, month = {Jun}, pages = {976--986}, title = {Material Matters: Exploring Materiality in Digital Musical Instruments Design}, year = {2022}, doi = {10.1145/3532106.3533523}, booktitle = {Designing Interactive Systems Conference}, day = {13}, publicationstatus = {published} }
@inproceedings{zhang2022integratingdisciplines, author = {Zhang, M and Stewart, R and Bryan-Kinns, N}, conference = {DIS 2022 - Proceedings of the 2022 ACM Designing Interactive Systems Conference: Digital Wellbeing}, month = {Jun}, pages = {1277--1287}, title = {Integrating Interactive Technology Concepts With Material Expertise in Textile Design Disciplines}, year = {2022}, abstract = {Textile and fashion designers are increasingly interested in integrating interactive technologies into their practice. However, traditional design education typically lacks support for them to develop technical digital and electronics skills alongside their expertise in materials. Reflecting on outputs from an e-textile design workshop and 8-week design projects with four textile design students using an e-textile toolkit, and follow-up data collection with the students one year after the projects, we argue that starting technical explorations with raw materials results in a better understanding and more flexible use of technical knowledge. We also argue that this newly acquired knowledge is then more fully integrated with their pre-existing material knowledge as it is applied to physical interface design. The results contribute to the development of tools and approaches in supporting designers with material expertise to learn tangible interaction design skills.}, doi = {10.1145/3532106.3533535}, isbn = {9781450393584}, day = {13}, publicationstatus = {published} }
@article{bryankinns2022qi2heepistemology, author = {Bryan-Kinns, N and Wang, W and Ji, T}, journal = {International Journal of Human Computer Studies}, month = {Apr}, title = {Qi2He: A co-design framework inspired by eastern epistemology}, volume = {160}, year = {2022}, abstract = {The rapid development of rural societies mixed with the infrastructural transformation of emerging economies bring both challenges and opportunities to Human-Computer Interaction (HCI) design as illustrated through the emergence of the field of HCI for Development (HCI4D). A key challenge for HCI4D is how local knowledge, expertise, and culture can be constructively combined with global trends in digital innovation and socioeconomic development. Co-design and participatory design practices in HCI offer opportunities to engage diverse communities in design activities which embrace both transition and tradition in constructive ways. We present our co-design framework, Qi2He, which supports designers and local communities engaging in co-design activities. Qi2He is inspired by traditional Chinese epistemology and contributes (i) methods to support cross-cultural co-design engagement, and (ii) post-hoc critique of co-design participation. We illustrate the use of Qi2He through three case studies of HCI design over four years in rural China where local culture and traditions are in a state of flux from waves of migration to cities whilst also being an integral part of the broader national and global transformation. The first case study examines how local rural knowledge can be shared and acquired to create a design system for ethnic brocade production. The second case study explores how the creation of an interactive drama can be used as a driver for rural community engagement. The third case study focusses on the iterative design of cross-cultural interactive product innovation. We conclude by reflecting on lessons we learnt when structuring and restructuring our co-design process and offer suggestions for how our Qi2He framework could be used by others and in different cultural settings.}, doi = {10.1016/j.ijhcs.2022.102773}, issn = {1071-5819}, eissn = {1095-9300}, day = {1}, publicationstatus = {published} }
@inproceedings{fordspeculatingai, author = {Ford, C and Bryan-Kinns, N}, conference = {}, title = {Speculating on Reflection and People’s Music Co-Creation with AI}, year = {2022}, startyear = {2022}, startmonth = {May}, startday = {10}, finishyear = {2022}, finishmonth = {May}, finishday = {10}, booktitle = {Generative AI and HCI Workshop at CHI 2022}, publicationstatus = {accepted} }
@article{robson2022onpractitioners, author = {Robson, N and Bryan-Kinns, N and McPherson, A}, journal = {Organised Sound: an international journal of music and technology}, month = {Feb}, number = {1}, publisher = {Cambridge University Press (CUP)}, title = {On mediating space, sound and experience: interviews with situated sound art practitioners}, volume = {28}, year = {2022}, abstract = {This article reports on an interview-based study with ten sound artists and composers, all engaged in situated sonic practices. We propose that these artists engage the ear and shape possible interactions with the artwork by altering the relationship between sound, the space in which it is heard and the people who hear it. Our interviews probe the creative process and explore how a sound artist’s methods and tools might influence the reception of their work. A thematic analysis of interview transcriptions leads us to characterise artist processes as mediatory, in the sense that they act in-between site and audience experience and are guided by the nonhuman agencies of settings and material things. We propose that artists transfer their own situated and embodied listening to that of the audience and develop sonic and staging devices to direct perceptual activity and listening attention. Our findings also highlight a number of engagement challenges, in particular the difficulty artists face in understanding their audience’s experience and the specificity of an artwork’s effect to not just its location, but to the disposition, abilities and prior experiences of listeners.}, doi = {10.1017/S1355771822000103}, issn = {1355-7718}, day = {9}, publicationstatus = {published} }
@inproceedings{zhang2022qiaolevr, author = {Zhang, J and Bryan-Kinns, N}, conference = {Proceedings - 2022 IEEE Conference on Virtual Reality and 3D User Interfaces Abstracts and Workshops, VRW 2022}, month = {Jan}, pages = {357--362}, title = {QiaoLe: Accessing Traditional Chinese Musical Instruments in VR}, year = {2022}, abstract = {Virtual Reality (VR) offers the potential for more engaging access to Intangible Cultural Heritage. We present the design of a VR system (QiaoLe) in which people can access and learn about traditional Chinese musical instruments. We undertook a user study of QiaoLe (24 participants) comparing three interaction modes. Results suggest that embodied interaction and gamification improved users' experience, presence, and enjoyment in QiaoLe, but gamification may distract from rote learning.}, doi = {10.1109/VRW55335.2022.00080}, isbn = {9781665484022}, day = {1}, publicationstatus = {published} }
@incollection{soave2022designingreality, author = {Soave, F and Bryan-Kinns, N and Farkhatdinov, I}, conference = {}, month = {Jan}, pages = {92--101}, title = {Designing Audio Feedback to Enhance Motion Perception in Virtual Reality}, volume = {13417 LNCS}, year = {2022}, abstract = {We present our study on the design and evaluation of sound samples for motion perception in a Virtual Reality (VR) application. In a previous study we found our sound samples to be incoherent with the VR visual channel. In the current research we designed four new samples and tested them, adapting standard subjective evaluation protocols to our needs. Twenty participants took part in the study and rated each animation in Realism, Matching and Plausibility. Significant differences were found among the sounds, and discussion arose on the need for realism in VR applications as well as users’ expectation and how it could influence their experience.}, doi = {10.1007/978-3-031-15019-7_9}, isbn = {9783031150180}, issn = {0302-9743}, eissn = {1611-3349}, day = {1}, publicationstatus = {published} }
@inproceedings{lepri2022uselesspractice, author = {Lepri, G and McPherson, A and Bowers, J}, conference = {}, month = {Jul}, title = {Useless, not Worthless: Absurd Making as Critical Practice}, year = {2022}, abstract = {We report on the outcomes of a hackathon organised around the themes of absurd musical interfaces, questionable sonic interactions and unworkable music designs. At the core of the project is the intention to explore absurd making as a way to support critical and disruptive design practices. We reflect on how surreal, nonsensical and fragile artefacts can be helpful to stretch and critique conventional ideas of what is useful and appropriate in technology research and development. After introducing both concepts and methods that shaped the event we present a selection of useless interfaces designed by the hackathon's attendees. These musical artefacts, and the considerations around them, are then discussed as a viable means for communicating both design concerns and future visions. We also consider two features identified as playing a crucial role within the event: the discovery of contradictions and the importance of context-based ingredients.}, doi = {10.1145/3357236.3395547}, booktitle = {ACM Conference on Designing Interactive Systems}, day = {1}, publicationstatus = {accepted} }
@inproceedings{mice2022theperformances, address = {Auckland, New Zealand}, author = {Mice, L and McPherson, A}, conference = {}, month = {Jun}, organization = {Auckland, New Zealand}, publisher = {International Conference on New Interfaces for Musical Expression}, title = {The M in NIME: Motivic analysis and the case for a musicology of NIME performances}, url = {https://www.nime.org/}, year = {2022}, abstract = {While the value of new digital musical instruments lies to a large extent in their music making capacity, analyses of new instruments in the research literature often focus on analyses of gesture or performer experience rather than the content of the music made with the instrument. In this paper we present a motivic analysis of music made with new instruments. In the context of music, a motive is a small, analysable musical fragment or phrase that is important in or characteristic of a composition. We outline our method for identifying and analysing motives in music made with new instruments, and display its use in a case study in which 10 musicians created performances with a new large-scale digital musical instrument that we designed. This research illustrates the value of a musicological approach to NIME research, suggesting the need for a broader conversation about a musicology of NIME performances, as distinct from its instruments.}, startyear = {2022}, startmonth = {Jun}, startday = {28}, finishyear = {2022}, finishmonth = {Jul}, finishday = {1}, keyword = {Motivic analysis}, keyword = {digital musical instrument design}, keyword = {musical interaction}, keyword = {large DMI}, keyword = {musicology of NIME performance}, keyword = {music theory}, keyword = {DMI evaluation}, booktitle = {International Conference on New Interfaces for Musical Expression}, day = {28}, publicationstatus = {accepted} }
@inproceedings{lepri2022thespeculation, author = {Lepri, G and Bowers, J and Topley, S and Stapleton, P and Bennett, P and Andersen, K and McPherson, A}, conference = {}, month = {Jun}, title = {The 10,000 Instruments Workshop - (Im)practical Research for Critical Speculation}, year = {2022}, doi = {10.21428/92fbeb44.9e7c9ba3}, booktitle = {NIME 2022}, day = {28} }
@inproceedings{guidi2022quantitativeinstruments, author = {Guidi, A and McPherson, A}, conference = {}, month = {Jun}, title = {Quantitative evaluation of aspects of embodiment in new digital musical instruments}, year = {2022}, doi = {10.21428/92fbeb44.79d0b38f}, booktitle = {NIME 2022}, day = {28} }
@inproceedings{reed2022exploringmicrophenomenology, author = {Reed, CN and Nordmoen, C and Martelloni, A and Lepri, G and Robson, N and Zayas-Garin, E and Cotton, K and Mice, L and McPherson, A}, conference = {}, month = {Jun}, title = {Exploring Experiences with New Musical Instruments through Micro-phenomenology}, year = {2022}, doi = {10.21428/92fbeb44.b304e4b1}, keyword = {Clinical Research}, booktitle = {NIME 2022}, day = {28} }
@inproceedings{pelinski2022embeddedopportunities, author = {Pelinski, T and Shepardson, V and Symons, S and Caspe, FS and Benito Temprano, AL and Armitage, J and Kiefer, C and Fiebrink, R and Magnusson, T and McPherson, A}, conference = {}, month = {Jun}, title = {Embedded AI for NIME: Challenges and Opportunities}, year = {2022}, doi = {10.21428/92fbeb44.76beab02}, booktitle = {NIME 2022}, day = {28} }
@inproceedings{zayasgarin2022dialogicexperience, author = {Zayas-Garin, E and McPherson, A}, conference = {}, month = {Jun}, title = {Dialogic Design of Accessible Digital Musical Instruments: Investigating Performer Experience}, year = {2022}, doi = {10.21428/92fbeb44.2b8ce9a4}, booktitle = {NIME 2022}, day = {28} }
@inproceedings{nordmoen2022makingsystem, author = {Nordmoen, C and McPherson, AP}, conference = {DIS 2022 - Proceedings of the 2022 ACM Designing Interactive Systems Conference: Digital Wellbeing}, month = {Jun}, pages = {415--423}, title = {Making space for material entanglements: A diffractive analysis of woodwork and the practice of making an interactive system}, year = {2022}, abstract = {A shift in perspective is underway in design research and human-computer interaction (HCI) from humans as the centre of attention to considering complex assemblages of human and non-human stakeholders. While this shift is often approached from a broad ecological level, there is opportunity for a more local shift in understanding our day to day meeting with the material world. Drawing on the posthuman theories of Karen Barad, we explore the creation of a digital interactive system as a material-discursive practice in which matter and culture are inseparably entangled. We seek a fresh look at the process rather than the outcome of interactive system design through a diffractive reading of four traditional woodworking practices and an auto-ethnographic account of the development of a digital sensor and actuator apparatus as a way to find alternative ways of attending to materials in HCI.}, doi = {10.1145/3532106.3533572}, isbn = {9781450393584}, day = {13}, publicationstatus = {published} }
@inproceedings{mice2022superdesign, author = {Mice, L and McPherson, AP}, conference = {Conference on Human Factors in Computing Systems - Proceedings}, month = {Apr}, title = {Super Size Me: Interface Size, Identity and Embodiment in Digital Musical Instrument Design}, year = {2022}, abstract = {Digital interfaces are shrinking, driven by pressures of mass production and consumer culture, and often accompanied by a discourse of control, precision or convenience. Meanwhile, human bodies remain the same size, and the changing size of interfaces has implications for the formation of user identities. Drawing on embodied cognition, effort and entanglement theories of HCI, we explored the impact of interface size on the co-constitution of humans and technology. We designed an oversized digital musical instrument and invited musicians to use the instrument to create original performances. We found that both the performances and the musicians' self-perception were influenced by the large size of the instrument, shining new light on the ways in which designing technology is designing humans and in turn culture.}, doi = {10.1145/3491102.3517626}, isbn = {9781450391573}, day = {29}, publicationstatus = {published} }
@inproceedings{reed2022singingperformances, author = {Reed, CN and Skach, S and Strohmeier, P and McPherson, AP}, conference = {ACM International Conference Proceeding Series}, month = {Mar}, pages = {170--183}, title = {Singing Knit: Soft Knit Biosensing for Augmenting Vocal Performances}, year = {2022}, abstract = {This paper discusses the design of the Singing Knit, a wearable knit collar for measuring a singer's vocal interactions through surface electromyography. We improve the ease and comfort of multi-electrode bio-sensing systems by adapting knit e-textile methods. The goal of the design was to preserve the capabilities of rigid electrode sensing while addressing its shortcomings, focusing on comfort and reliability during extended wear, practicality and convenience for performance settings, and aesthetic value. We use conductive, silver-plated nylon jersey fabric electrodes in a full rib knit accessory for sensing laryngeal muscular activation. We discuss the iterative design and the material decision-making process as a method for building integrated soft-sensing wearable systems for similar settings. Additionally, we discuss how the design choices through the construction process reflect its use in a musical performance context.}, doi = {10.1145/3519391.3519412}, isbn = {9781450396325}, day = {13}, publicationstatus = {published} }
@inproceedings{luo2022towardsaudio, author = {Luo, Y-J and Ewert, S and Dixon, S}, conference = {}, month = {Jul}, pages = {3299--3305}, title = {Towards Robust Unsupervised Disentanglement of Sequential Data — A Case Study Using Music Audio}, year = {2022}, doi = {10.24963/ijcai.2022/458}, booktitle = {Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence}, day = {1} }
@article{proutskova2022thejazz, author = {Proutskova, P and Wolff, D and Fazekas, G and Frieler, K and Höger, F and Velichkina, O and Solis, G and Weyde, T and Pfleiderer, M and Crayencour, HC and Peeters, G and Dixon, S}, journal = {Journal of Web Semantics}, month = {Jul}, pages = {100735--100735}, title = {The Jazz Ontology: A semantic model and large-scale RDF repositories for jazz}, year = {2022}, doi = {10.1016/j.websem.2022.100735}, issn = {1570-8268}, day = {1}, publicationstatus = {published} }
@inproceedings{stollerwaveunetseparation, author = {Stoller, D and Ewert, S and Dixon, S}, conference = {}, title = {Wave-U-Net: A Multi-Scale Neural Network for End-to-End Audio Source Separation}, year = {2018}, booktitle = {19th International Society for Music Information Retrieval Conference (ISMIR)}, publicationstatus = {accepted} }
@inproceedings{sarkar2022ensemblesetseparation, author = {Sarkar, S and Benetos, E and Sandler, M}, conference = {}, month = {Dec}, organization = {Bangalore}, title = {EnsembleSet: A new high-quality synthesised dataset for chamber ensemble separation}, year = {2022}, abstract = {Music source separation research has made great advances in recent years, especially towards the problem of separating vocals, drums, and bass stems from mastered songs. The advances in this field can be directly attributed to the availability of large-scale multitrack research datasets for these mentioned stems. Tasks such as separating similar-sounding sources from an ensemble recording have seen limited research due to the lack of sizeable, bleed-free multitrack datasets. In this paper, we introduce a novel multitrack dataset called EnsembleSet generated using the Spitfire BBC Symphony Orchestra library using ensemble scores from RWC Classical Music Database and Mutopia. Our data generation method introduces automated articulation mapping for different playing styles based on the input MIDI/MusicXML data. The sample library also enables us to render the dataset with 20 different mix/microphone configurations allowing us to study various recording scenarios for each performance. The dataset presents 80 tracks (6+ hours) with a range of string, wind, and brass instruments arranged as chamber ensembles. We also present our benchmark on our synthesised dataset using a permutation-invariant time-domain separation model for chamber ensembles which produces generalisable results when tested on real recordings from existing datasets.}, startyear = {2022}, startmonth = {Dec}, startday = {5}, finishyear = {2022}, finishmonth = {Dec}, finishday = {8}, booktitle = {International Society for Music Information Retrieval}, day = {8}, publicationstatus = {accepted} }
@misc{caspe2022ddx7sounds, author = {Caspe, F and McPherson, A and Sandler, M}, month = {Aug}, title = {DDX7: Differentiable FM Synthesis of Musical Instrument Sounds}, year = {2022}, doi = {10.48550/arxiv.2208.06169}, day = {12} }
@misc{subramanian2022anomalousmethods, author = {Subramanian, V and Gururani, S and Benetos, E and Sandler, M}, month = {Jul}, title = {Anomalous behaviour in loss-gradient based interpretability methods}, year = {2022}, doi = {10.48550/arxiv.2207.07769}, day = {15} }
@inproceedings{zhao2022violinistdistributions, author = {Zhao, Y and Fazekas, G and Sandler, M}, conference = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings}, month = {Jan}, pages = {601--605}, title = {Violinist Identification Using Note-Level Timbre Feature Distributions}, volume = {2022-May}, year = {2022}, abstract = {Modelling musical performers' individual playing styles based on audio features is important for music education, music expression analysis and music generation. In violin performance, the perception of playing styles is mainly affected by the characteristic musical timbre, which is mostly determined by performers, instruments and recording conditions. To verify if timbre features can describe a performer's style adequately, we examine a violinist identification method based on note-level timbre feature distributions. We first apply it using solo datasets to recognise professional violinists, then use it to identify master players from commercial concerto recordings. The results show that the designed features and method work very well for both datasets. The identification accuracies with the solo dataset using MFCCs and spectral contrast features are 0.94 and 0.91, respectively. Significantly lower but promising results are reported with the concerto dataset. Results suggest that the selected timbre features can model performers' individual playing reasonably objectively, regardless of the instrument they play.}, doi = {10.1109/ICASSP43922.2022.9747606}, isbn = {9781665405409}, issn = {1520-6149}, day = {1}, publicationstatus = {published} }
@inproceedings{ozaki2022similaritiesrecordings, author = {Ozaki, Y and Kuroyanagi, J and McBride, J and Proutskova, P and Tierney, A and Pfordresher, P and Benetos, E and Liu, F and Savage, PE}, conference = {}, month = {Sep}, organization = {Kanazawa, Japan}, title = {Similarities and differences in a cross-linguistic sample of song and speech recordings}, url = {https://sites.google.com/view/joint-conf-language-evolution/home}, year = {2022}, startyear = {2022}, startmonth = {Sep}, startday = {5}, finishyear = {2022}, finishmonth = {Sep}, finishday = {8}, booktitle = {Joint Conference on Language Evolution}, day = {5}, publicationstatus = {accepted} }
@inproceedings{wang2022jointcallrecognition, author = {Wang, C and Benetos, E and Versace, E and Wang, S}, conference = {}, month = {Aug}, organization = {Belgrade, Serbia}, pages = {195--199}, title = {Joint Scattering for Automatic Chick Call Recognition}, year = {2022}, abstract = {Animal vocalisations contain important information about health, emotional state, and behaviour, thus can be potentially used for animal welfare monitoring and behavioural neuroscience studies. Motivated by the spectro-temporal patterns of chick calls in the time–frequency domain, in this paper we propose an automatic system for chick call recognition using the joint time–frequency scattering transform (JTFS). Taking full-length recordings as input, the system first extracts chick call candidates by an onset detector and silence removal. After computing their JTFS features, a support vector machine classifier groups each candidate into different chick call types. Evaluating on a dataset comprising 3013 chick calls collected in laboratory conditions, the proposed recognition system using the JTFS features improves the frame- and event-based macro F-measures by 9.5\% and 11.7\%, respectively, compared to a mel-frequency cepstral coefficients baseline.}, startyear = {2022}, startmonth = {Aug}, startday = {29}, finishyear = {2022}, finishmonth = {Sep}, finishday = {2}, keyword = {audio signal processing}, keyword = {bioacoustics}, keyword = {scattering transform}, booktitle = {30th European Signal Processing Conference}, day = {29}, publicationstatus = {published} }
@inproceedings{liuperformancetracking, author = {Liu, L and Kong, Q and Morfi, G-V and Benetos, E}, conference = {}, organization = {Bengaluru, India}, title = {Performance MIDI-to-score conversion by neural beat tracking}, url = {https://cheriell.github.io/}, year = {2022}, abstract = {Rhythm quantisation is an essential part of converting performance MIDI recordings into musical scores. Previous works on rhythm quantisation are limited to the use of probabilistic or statistical methods. In this paper, we propose a MIDI-to-score quantisation method using a convolutional-recurrent neural network (CRNN) trained on MIDI note sequences to predict whether notes are on beats. Then, we expand the CRNN model to predict the quantised times for all beat and non-beat notes. Furthermore, we enable the model to predict the key signatures, time signatures, and hand parts of all notes. Our proposed performance MIDI-to-score system achieves significantly better performance compared to commercial software evaluated on the MV2H metric. We release the toolbox for converting performance MIDI into MIDI scores at: https://github.com/cheriell/PM2S}, startyear = {2022}, startmonth = {Dec}, startday = {4}, finishyear = {2022}, finishmonth = {Dec}, finishday = {8}, booktitle = {Proceedings of the 23rd International Society for Music Information Retrieval Conference}, publicationstatus = {accepted} }
@inproceedings{daikoku2022agreementsample, author = {Daikoku, H and Ding, S and Benetos, E and Wood, ALC and Shimizono, T and Sanne, US and Fujii, S and Savage, PE}, conference = {}, month = {Jun}, organization = {Sheffield, UK}, title = {Agreement among human and automated estimates of similarity in a global music sample}, year = {2022}, abstract = {While music information retrieval (MIR) has made substantial progress in automatic analysis of audio similarity for Western music, it remains unclear whether these algorithms can be meaningfully applied to cross-cultural analyses of more diverse musics. Here we collect perceptual ratings from 62 Japanese participants using a global sample of 30 traditional songs, and compare these ratings against both pre-existing expert annotations and audio similarity algorithms. We find that different methods of perceptual ratings all produced similar, moderate levels of inter-rater agreement comparable to previous studies, but that agreement between human and automated methods is always low regardless of the specific methods used to calculate musical similarity. Our findings suggest that the MIR methods tested are unable to measure cross-cultural music similarity in perceptually meaningful ways.}, startyear = {2022}, startmonth = {Jun}, startday = {14}, finishyear = {2022}, finishmonth = {Jun}, finishday = {17}, booktitle = {10th International Workshop on Folk Music Analysis (FMA 2022)}, day = {14}, publicationstatus = {accepted} }
@inproceedings{huang2022improvingdetection, author = {Huang, J and Benetos, E and Ewert, S}, conference = {}, month = {May}, organization = {Singapore}, pages = {451--455}, publisher = {IEEE}, title = {Improving Lyrics Alignment Through Joint Pitch Detection}, year = {2022}, abstract = {In recent years, the accuracy of automatic lyrics alignment methods has increased considerably. Yet, many current approaches employ frameworks designed for automatic speech recognition (ASR) and do not exploit properties specific to music. Pitch is one important musical attribute of singing voice but it is often ignored by current systems as the lyrics content is considered independent of the pitch. In practice, however, there is a temporal correlation between the two as note starts often correlate with phoneme starts. At the same time the pitch is usually annotated with high temporal accuracy in ground truth data while the timing of lyrics is often only available at the line (or word) level. In this paper, we propose a multi-task learning approach for lyrics alignment that incorporates pitch and thus can make use of a new source of highly accurate temporal information. Our results show that the accuracy of the alignment result is indeed improved by our approach. As an additional contribution, we show that integrating boundary detection in the forced-alignment algorithm reduces cross-line errors, which improves the accuracy even further.}, doi = {10.1109/ICASSP43922.2022.9746460}, startyear = {2022}, startmonth = {May}, startday = {22}, finishyear = {2022}, finishmonth = {May}, finishday = {27}, booktitle = {2022 IEEE International Conference on Acoustics, Speech and Signal Processing}, day = {22}, publicationstatus = {published} }
@inproceedings{manco2022learningsupervision, author = {Manco, I and Benetos, E and Quinton, E and Fazekas, G}, conference = {}, month = {May}, organization = {Singapore}, pages = {456--460}, publisher = {IEEE}, title = {Learning music audio representations via weak language supervision}, url = {https://ilariamanco.com/}, year = {2022}, abstract = {Audio representations for music information retrieval are typically learned via supervised learning in a task-specific fashion. Although effective at producing state-of-the-art results, this scheme lacks flexibility with respect to the range of applications a model can have and requires extensively annotated datasets. In this work, we pose the question of whether it may be possible to exploit weakly aligned text as the only supervisory signal to learn general-purpose music audio representations. To address this question, we design a multimodal architecture for music and language pre-training (MuLaP) optimised via a set of proxy tasks. Weak supervision is provided in the form of noisy natural language descriptions conveying the overall musical content of the track. After pre-training, we transfer the audio backbone of the model to a set of music audio classification and regression tasks. We demonstrate the usefulness of our approach by comparing the performance of audio representations produced by the same audio backbone with different training strategies and show that our pre-training method consistently achieves comparable or higher scores on all tasks and datasets considered. Our experiments also confirm that MuLaP effectively leverages audio-caption pairs to learn representations that are competitive with audio-only and cross-modal self-supervised methods in the literature.}, doi = {10.1109/ICASSP43922.2022.9746996}, startyear = {2022}, startmonth = {May}, startday = {22}, finishyear = {2022}, finishmonth = {May}, finishday = {27}, keyword = {audio and language}, keyword = {audio representations}, keyword = {multimodal learning}, keyword = {music information retrieval}, booktitle = {2022 IEEE International Conference on Acoustics, Speech and Signal Processing}, day = {22}, publicationstatus = {published} }
@inproceedings{ou2022exploringtranscription, author = {Ou, L and Guo, Z and Benetos, E and Han, J and Wang, Y}, conference = {}, month = {May}, organization = {Singapore}, pages = {776--780}, publisher = {IEEE}, title = {Exploring transformer's potential on automatic piano transcription}, url = {https://2022.ieeeicassp.org/}, year = {2022}, abstract = {Most recent research about automatic music transcription (AMT) uses convolutional neural networks and recurrent neural networks to model the mapping from music signals to symbolic notation. Based on a high-resolution piano transcription system, we explore the possibility of incorporating another powerful sequence transformation tool—the Transformer—to deal with the AMT problem. We argue that the properties of the Transformer make it more suitable for certain AMT subtasks. We confirm the Transformer’s superiority on the velocity detection task by experiments on the MAESTRO dataset and a cross-dataset evaluation on the MAPS dataset. We observe a performance improvement on both frame-level and note-level metrics after introducing the Transformer network.}, doi = {10.1109/ICASSP43922.2022.9746789}, startyear = {2022}, startmonth = {May}, startday = {7}, finishyear = {2022}, finishmonth = {May}, finishday = {13}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, day = {7}, publicationstatus = {published} }
@misc{ragano2022aquality, author = {Ragano, A and Benetos, E and Chinen, M and Martinez, HB and Reddy, CKA and Skoglund, J and Hines, A}, month = {Apr}, title = {A Comparison of Deep Learning MOS Predictors for Speech Synthesis Quality}, year = {2022}, doi = {10.48550/arxiv.2204.02249}, keyword = {Clinical Research}, day = {5} }
@article{ragano2022automaticarchives, author = {Ragano, A and Benetos, E and Hines, A}, journal = {Journal of the Audio Engineering Society}, month = {Apr}, number = {4}, pages = {252--270}, publisher = {Audio Engineering Society}, title = {Automatic Quality Assessment of Digitized and Restored Sound Archives}, volume = {70}, year = {2022}, abstract = {Archiving digital audio is conducted to preserve and make records accessible. However, techniques for assessing the quality of experience (QoE) of sound archives are usually neglected. In this paper, we present a framework to assess the QoE of sound archives in an automatic fashion. We describe the QoE influence factors, stakeholders, and audio archive degradations and explore the above concepts through a case study on the NASA Apollo audio archive. Each component of the framework is described in the audio archive lifecycle based on digitization, restoration, and consumption. We provide insights and real-world examples on why digitized and restored audio archives benefit from QoE assessment techniques similar to other multimedia applications such as video calling and streaming services. The reasons why stakeholders such as archivists, broadcasters, or public listeners would benefit from our proposed framework are also provided.}, doi = {10.17743/jaes.2022.0002}, issn = {0004-7554}, day = {1}, publicationstatus = {accepted} }
@article{wang2022adaptiverecognition, author = {Wang, C and Benetos, E and Lostanlen, V and Chew, E}, journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing}, month = {Mar}, pages = {1407--1421}, publisher = {Institute of Electrical and Electronics Engineers}, title = {Adaptive Scattering Transforms for Playing Technique Recognition}, url = {https://changhongw.github.io/}, volume = {30}, year = {2022}, abstract = {Playing techniques contain distinctive information about musical expressivity and interpretation. Yet, current research in music signal analysis suffers from a scarcity of computational models for playing techniques, especially in the context of live performance. To address this problem, our paper develops a general framework for playing technique recognition. We propose the adaptive scattering transform, which refers to any scattering transform that includes a stage of data-driven dimensionality reduction over at least one of its wavelet variables, for representing playing techniques. Two adaptive scattering features are presented: frequency-adaptive scattering and direction-adaptive scattering. We analyse seven playing techniques: vibrato, tremolo, trill, flutter-tongue, acciaccatura, portamento, and glissando. To evaluate the proposed methodology, we create a new dataset containing full-length Chinese bamboo flute performances (CBFdataset) with expert playing technique annotations. Once trained on the proposed scattering representations, a support vector classifier achieves state-of-the-art results. We provide explanatory visualisations of scattering coefficients for each technique and verify the system over three additional datasets with various instrumental and vocal techniques: VPset, SOL, and VocalSet.}, doi = {10.1109/TASLP.2022.3156785}, issn = {2329-9304}, keyword = {music performance analysis}, keyword = {music signal analysis}, keyword = {scattering transform}, day = {7}, publicationstatus = {published} }
@article{benetos2022measuringdata, author = {Benetos, E and Ragano, A and Sgroi, D and Tuckwell, A}, journal = {Behavior Research Methods}, month = {Feb}, publisher = {Springer (part of Springer Nature)}, title = {Measuring national mood with music: using machine learning to construct a measure of national valence from audio data}, year = {2022}, abstract = {We propose a new measure of national valence based on the emotional content of a country’s most popular songs. We first trained a machine learning model using 191 different audio features embedded within music and use this model to construct a long-run valence index for the UK. This index correlates strongly and significantly with survey-based life satisfaction and outperforms an equivalent text-based measure. Our methods have the potential to be applied widely and to provide a solution to the severe lack of historical time-series data on psychological well-being.}, doi = {10.3758/s13428-021-01747-7}, issn = {1554-351X}, day = {25}, publicationstatus = {published} }
@article{terenzi2022comparisonactivity, author = {Terenzi, A and Ortolani, N and De Almeida Nolasco, I and Benetos, E and Cecchi, S}, journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing}, month = {Jan}, pages = {112--122}, publisher = {Institute of Electrical and Electronics Engineers}, title = {Comparison of feature extraction methods for sound-based classification of honey bee activity}, volume = {30}, year = {2022}, abstract = {Honey bees are one of the most important insects on the planet since they play a key role in the pollination services of both cultivated and spontaneous flora. Recent years have seen an increase in bee mortality which points out the necessity of intensive beehive monitoring in order to better understand this phenomenon and try to help these important insects. In this scenario, this work presents an algorithm for sound-based classification of honey bee activity reporting a preliminary comparison between various extracted features used separately as input to a convolutional neural network classifier. In particular, the orphaned colony situation has been considered using a dataset acquired in a real situation. Different experiments with different setups have been carried out in order to test the performance of the proposed system, and the results have confirmed its potentiality.}, doi = {10.1109/TASLP.2021.3133194}, issn = {2329-9304}, keyword = {Convolutional neural networks}, keyword = {feature extraction}, keyword = {continuous wavelet transform}, keyword = {Hilbert-Huang transform}, keyword = {mel frequency cepstrum coefficients}, keyword = {honey bees}, keyword = {bioacoustics}, keyword = {computational bioacoustic scene analysis}, day = {1}, publicationstatus = {published} }
@article{stowell2022computationalroadmap, author = {Stowell, D}, journal = {PeerJ}, month = {Mar}, title = {Computational bioacoustics with deep learning: a review and roadmap}, year = {2022}, abstract = {Animal vocalisations and natural soundscapes are fascinating objects of study, and contain valuable evidence about animal behaviours, populations and ecosystems. They are studied in bioacoustics and ecoacoustics, with signal processing and analysis an important component. Computational bioacoustics has accelerated in recent decades due to the growth of affordable digital sound recording devices, and to huge progress in informatics such as big data, signal processing and machine learning. Methods are inherited from the wider field of deep learning, including speech and image processing. However, the tasks, demands and data characteristics are often different from those addressed in speech or music analysis. There remain unsolved problems, and tasks for which evidence is surely present in many acoustic signals, but not yet realised. In this paper I perform a review of the state of the art in deep learning for computational bioacoustics, aiming to clarify key concepts and identify and analyse knowledge gaps. Based on this, I offer a subjective but principled roadmap for computational bioacoustics with deep learning: topics that the community should aim to address, in order to make the most of future developments in AI and informatics, and to use audio data in answering zoological and ecological questions.}, doi = {10.7717/peerj.13152}, eissn = {2167-8359}, day = {21}, publicationstatus = {published} }
@article{linhart2022themammals, author = {Linhart, P and Mahamoud-Issa, M and Stowell, D and Blumstein, DT}, journal = {Mammalian Biology}, month = {Jan}, title = {The potential for acoustic individual identification in mammals}, year = {2022}, abstract = {Many studies have revealed that animal vocalizations, including those from mammals, are individually distinctive. Therefore, acoustic identification of individuals (AIID) has been repeatedly suggested as a non-invasive and labor efficient alternative to mark-recapture identification methods. We present a pipeline of steps for successful AIID in a given species. By conducting such work, we will also improve our understanding of identity signals in general. Strong and stable acoustic signatures are necessary for successful AIID. We reviewed studies of individual variation in mammalian vocalizations as well as pilot studies using acoustic identification to census mammals and birds. We found the greatest potential for AIID (characterized by strong and stable acoustic signatures) was in Cetacea and Primates (including humans). In species with weaker acoustic signatures, AIID could still be a valuable tool once its limitations are fully acknowledged. A major obstacle for widespread utilization of AIID is the absence of tools integrating all AIID subtasks within a single package. Automation of AIID could be achieved with the use of advanced machine learning techniques inspired by those used in human speaker recognition or tailored to specific challenges of animal AIID. Unfortunately, further progress in this area is currently hindered by the lack of appropriate publicly available datasets. However, we believe that after overcoming the issues outlined above, AIID can quickly become a widespread and valuable tool in field research and conservation of mammals and other animals.}, doi = {10.1007/s42991-021-00222-2}, issn = {1616-5047}, eissn = {1618-1476}, day = {1}, publicationstatus = {published} }
@inproceedings{nolasco2022rankbasedrepresentations, author = {Nolasco, I and Stowell, D}, conference = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings}, month = {Jan}, pages = {3623--3627}, title = {Rank-Based Loss for Learning Hierarchical Representations}, volume = {2022-May}, year = {2022}, abstract = {Hierarchical taxonomies are common in many contexts, and they are a very natural structure humans use to organise information. In machine learning, the family of methods that uses this "extra" information is called hierarchical classification. However, applied to audio classification, this remains relatively unexplored. Here we focus on how to integrate the hierarchical information of a problem to learn embeddings representative of the hierarchical relationships. Triplet loss has previously been proposed to address this problem; however, it presents issues such as requiring careful construction of the triplets and being limited in the extent of hierarchical information it uses at each iteration. In this work we propose a rank-based loss function that uses hierarchical information and translates it into a rank ordering of target distances between the examples. We show that the rank-based loss is suitable for learning hierarchical representations of the data. By testing on unseen fine-level classes we show that this method is also capable of learning hierarchically correct representations of the new classes. The rank-based loss has two promising aspects: it is generalisable to hierarchies with any number of levels, and it is capable of dealing with data with incomplete hierarchical labels.}, doi = {10.1109/ICASSP43922.2022.9746907}, isbn = {9781665405409}, issn = {1520-6149}, day = {1}, publicationstatus = {published} }
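The rank-based loss above turns hierarchy distances between examples into a rank ordering of target embedding distances. The following is a minimal Python (PyTorch) sketch of that idea, not the authors' exact formulation: it assumes the hierarchy distance is the number of levels on which two label paths disagree, groups pairs into ranks by that distance, and applies a margin hinge so that the mean embedding distance of each rank stays below that of the next; all function and variable names are illustrative.

    # Hedged sketch of a rank-ordering loss over hierarchical labels (illustrative only).
    import torch

    def hierarchy_distance(paths_a, paths_b):
        # paths_*: integer label paths with one entry per hierarchy level (coarse to fine).
        # Distance = number of levels on which the two paths disagree.
        return (paths_a != paths_b).sum(dim=-1).float()

    def rank_based_loss(embeddings, label_paths, margin=0.1):
        # embeddings: (N, D); label_paths: (N, L)
        dists = torch.cdist(embeddings, embeddings)              # pairwise embedding distances
        hdists = hierarchy_distance(label_paths.unsqueeze(1),    # (N, N) hierarchy distances
                                    label_paths.unsqueeze(0))
        iu = torch.triu_indices(len(embeddings), len(embeddings), offset=1)
        d, h = dists[iu[0], iu[1]], hdists[iu[0], iu[1]]
        ranks = torch.unique(h, sorted=True)
        loss = embeddings.new_zeros(())
        # Enforce: mean distance at rank r plus a margin stays below the mean at rank r + 1.
        for lo, hi in zip(ranks[:-1], ranks[1:]):
            loss = loss + torch.relu(d[h == lo].mean() - d[h == hi].mean() + margin)
        return loss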
@inproceedings{liang2022leveragingrecognition, author = {Liang, J and Phan, QH and Benetos, E}, conference = {}, month = {Nov}, organization = {Nancy, France}, title = {Leveraging label hierarchies for few-shot everyday sound recognition}, url = {https://jinhualiang.github.io/}, url = {https://dcase.community/workshop2022/}, year = {2022}, abstract = {Everyday sounds cover a considerable range of sound categories in our daily life, yet for certain sound categories it is hard to collect sufficient data. Although existing works have applied few-shot learning paradigms to sound recognition successfully, most of them have not exploited the relationship between labels in audio taxonomies. This work adopts a hierarchical prototypical network to leverage the knowledge rooted in audio taxonomies. Specifically, a VGG-like convolutional neural network is used to extract acoustic features. Prototypical nodes are then calculated at each level of the tree structure. A multi-level loss is obtained by combining the per-level losses with decaying weights. Experimental results demonstrate that our hierarchical prototypical networks not only outperform prototypical networks with no hierarchy information but also yield better results than other state-of-the-art algorithms. Our code is available at: https://github.com/JinhuaLiang/HPNs_tagging}, startyear = {2022}, startmonth = {Nov}, startday = {3}, finishyear = {2022}, finishmonth = {Nov}, finishday = {4}, keyword = {everyday sound recognition}, keyword = {few-shot learning}, keyword = {hierarchical prototypical network}, booktitle = {7th Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)}, day = {3}, publicationstatus = {accepted} }
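As a companion to the abstract above, here is a minimal, illustrative PyTorch sketch of a hierarchical prototypical loss: class prototypes are computed at every level of an assumed taxonomy, a distance-based classification loss is taken per level, and the per-level losses are combined with decaying weights. It is a sketch under these assumptions, not the authors' released implementation (their code is at the GitHub link above), and all names are hypothetical.

    # Hedged sketch of a multi-level prototypical loss over an audio taxonomy (illustrative names).
    import torch
    import torch.nn.functional as F

    def prototypical_level_loss(support, support_labels, query, query_labels):
        # support/query: (Ns, D)/(Nq, D) embeddings; labels: integer class ids at one taxonomy level.
        classes = torch.unique(support_labels)
        protos = torch.stack([support[support_labels == c].mean(dim=0) for c in classes])
        logits = -torch.cdist(query, protos)                     # nearer prototype -> higher logit
        targets = torch.stack([(classes == y).nonzero().squeeze() for y in query_labels])
        return F.cross_entropy(logits, targets)

    def hierarchical_loss(support, sup_paths, query, qry_paths, decay=0.5):
        # *_paths: (N, L) label ids per taxonomy level, coarse (column 0) to fine (column L - 1).
        total, weight = 0.0, 1.0
        for level in reversed(range(sup_paths.shape[1])):        # the fine level gets the largest weight
            total = total + weight * prototypical_level_loss(
                support, sup_paths[:, level], query, qry_paths[:, level])
            weight = weight * decay
        return total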
@inproceedings{lifewshotnetworks, author = {Li, R and Liang, J and Phan, QH}, conference = {}, organization = {Nancy, France}, title = {Few-Shot Bioacoustic Event Detection: Enhanced Classifiers for Prototypical Networks}, url = {https://dcase.community/workshop2022/}, year = {2022}, startyear = {2022}, startmonth = {Nov}, startday = {3}, finishyear = {2022}, finishmonth = {Nov}, finishday = {4}, booktitle = {7th Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)}, publicationstatus = {accepted} }
@inproceedings{singh2022hypernetworksproofofconcept, author = {Singh, S and Benetos, E and Phan, QH}, conference = {}, month = {Aug}, organization = {Belgrade, Serbia}, pages = {429--433}, publisher = {EURASIP}, title = {Hypernetworks for sound event detection: a proof-of-concept}, year = {2022}, abstract = {Polyphonic sound event detection (SED) involves the prediction of sound events present in an audio recording along with their onset and offset times. Recently, deep neural networks, specifically convolutional recurrent neural networks (CRNNs), have achieved impressive results for this task. The convolutional part of the architecture is used to extract translation-invariant features from the input, and the recurrent part learns the underlying temporal relationship between audio frames. Recent studies showed that the weight-sharing paradigm of recurrent networks might be a hindering factor in certain kinds of time series data, specifically where there is a temporal conditional shift, i.e. the conditional distribution of a label changes across the temporal scale. This warrants a relevant question: is there a similar phenomenon in polyphonic sound events due to the dynamic polyphony level across the temporal axis? In this work, we explore this question and inquire if relaxed weight sharing improves the performance of a CRNN for polyphonic SED. We propose to use hypernetworks to relax weight sharing in the recurrent part and show that the CRNN’s performance is improved by ~3\% across two datasets, thus paving the way for further exploration of the existence of temporal conditional shift for polyphonic SED.}, startyear = {2022}, startmonth = {Aug}, startday = {29}, finishyear = {2022}, finishmonth = {Sep}, finishday = {3}, booktitle = {30th European Signal Processing Conference (EUSIPCO 2022)}, day = {29}, publicationstatus = {published} }
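To make the idea of relaxed weight sharing concrete, the sketch below shows one simple way a hypernetwork can modulate a recurrent cell per time step: a small network maps the previous hidden state to scaling factors for the input-to-hidden and hidden-to-hidden transforms, so the effective recurrent weights vary over time. This is a generic, hedged illustration and not the architecture evaluated in the paper; class and function names are invented.

    # Hedged sketch: a vanilla RNN cell whose transforms are rescaled per time step by a small
    # hypernetwork, i.e. relaxed (time-varying) weight sharing. Illustrative, not the paper's model.
    import torch
    import torch.nn as nn

    class HyperRNNCell(nn.Module):
        def __init__(self, input_size, hidden_size, hyper_size=32):
            super().__init__()
            self.w_ih = nn.Linear(input_size, hidden_size, bias=False)
            self.w_hh = nn.Linear(hidden_size, hidden_size, bias=True)
            # Hypernetwork: maps the previous hidden state to per-unit scaling factors.
            self.hyper = nn.Sequential(
                nn.Linear(hidden_size, hyper_size), nn.Tanh(),
                nn.Linear(hyper_size, 2 * hidden_size))

        def forward(self, x_t, h_prev):
            scale_i, scale_h = self.hyper(h_prev).chunk(2, dim=-1)
            return torch.tanh(scale_i * self.w_ih(x_t) + scale_h * self.w_hh(h_prev))

    def run(cell, x):
        # x: (batch, time, feat) sequence of frame-level features, e.g. from a CNN front end.
        h = x.new_zeros(x.shape[0], cell.w_hh.out_features)
        outputs = []
        for t in range(x.shape[1]):
            h = cell(x[:, t], h)
            outputs.append(h)
        return torch.stack(outputs, dim=1)                       # (batch, time, hidden)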
@article{phan2022xsleepnetstaging, author = {Phan, H and Chen, OY and Tran, MC and Koch, P and Mertins, A and De Vos, M}, journal = {IEEE Trans Pattern Anal Mach Intell}, month = {Sep}, number = {9}, organization = {United States}, pages = {5903--5915}, title = {XSleepNet: Multi-View Sequential Model for Automatic Sleep Staging.}, url = {https://www.ncbi.nlm.nih.gov/pubmed/33788679}, volume = {44}, year = {2022}, abstract = {Automating sleep staging is vital to scale up sleep assessment and diagnosis to serve millions experiencing sleep deprivation and disorders and enable longitudinal sleep monitoring in home environments. Learning from raw polysomnography signals and their derived time-frequency image representations has been prevalent. However, learning from multi-view inputs (e.g., both the raw signals and the time-frequency images) for sleep staging is difficult and not well understood. This work proposes a sequence-to-sequence sleep staging model, XSleepNet, which is capable of learning a joint representation from both raw signals and time-frequency images. Since different views may generalize or overfit at different rates, the proposed network is trained such that the learning pace on each view is adapted based on their generalization/overfitting behavior. In simple terms, the learning on a particular view is sped up when it is generalizing well and slowed down when it is overfitting. View-specific generalization/overfitting measures are computed on-the-fly during the training course and used to derive weights to blend the gradients from different views. As a result, the network is able to retain the representation power of different views in the joint features which represent the underlying distribution better than those learned by each individual view alone. Furthermore, the XSleepNet architecture is principally designed to gain robustness to the amount of training data and to increase the complementarity between the input views. Experimental results on five databases of different sizes show that XSleepNet consistently outperforms the single-view baselines and the multi-view baseline with a simple fusion strategy. Finally, XSleepNet also outperforms prior sleep staging methods and improves previous state-of-the-art results on the experimental databases.}, doi = {10.1109/TPAMI.2021.3070057}, eissn = {1939-3539}, keyword = {Algorithms}, keyword = {Electroencephalography}, keyword = {Polysomnography}, keyword = {Sleep}, keyword = {Sleep Stages}, language = {eng}, publicationstatus = {published} }
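The blending of gradients from different views described above relies on view-specific generalization/overfitting measures computed during training. The snippet below is a hedged, illustrative heuristic for deriving such blending weights from recent training and validation losses; it is not XSleepNet's exact rule, and the dictionary layout and names are assumptions.

    # Hedged sketch: deriving per-view blending weights from generalization/overfitting
    # behaviour (an illustrative heuristic, not XSleepNet's exact rule). Each history list
    # must contain at least two epoch-level loss values.
    def view_blend_weights(train_hist, val_hist, eps=1e-8):
        # train_hist/val_hist: dict view_name -> list of recent epoch losses for that view.
        weights = {}
        for view in train_hist:
            gen = val_hist[view][-2] - val_hist[view][-1]        # drop in validation loss
            overfit = (train_hist[view][-2] - train_hist[view][-1]) - gen
            weights[view] = max(gen, 0.0) / (abs(overfit) + eps) # generalizing well, overfitting little
        total = sum(weights.values()) + eps
        return {v: w / total for v, w in weights.items()}        # normalized blending weights

    # The joint loss is then a weighted sum, so each view's gradient is scaled by its weight:
    #   loss = sum(weights[view] * view_losses[view] for view in view_losses)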
@article{heremans2022featureeeg, author = {Heremans, ERM and Phan, H and Ansari, AH and Borzée, P and Buyse, B and Testelmans, D and De Vos, M}, journal = {Biomedical Signal Processing and Control}, month = {Jul}, pages = {104009--104009}, publisher = {Elsevier}, title = {Feature matching as improved transfer learning technique for wearable EEG}, volume = {78}, year = {2022}, doi = {10.1016/j.bspc.2022.104009}, issn = {1746-8094}, day = {27}, publicationstatus = {published} }
@article{phan2022sleeptransformerquantification, author = {Phan, H and Mikkelsen, K and Chen, OY and Koch, P and Mertins, A and De Vos, M}, journal = {IEEE Trans Biomed Eng}, month = {Aug}, number = {8}, organization = {United States}, pages = {2456--2467}, title = {SleepTransformer: Automatic Sleep Staging With Interpretability and Uncertainty Quantification.}, url = {https://www.ncbi.nlm.nih.gov/pubmed/35100107}, volume = {69}, year = {2022}, abstract = {BACKGROUND: Black-box skepticism is one of the main hindrances impeding deep-learning-based automatic sleep scoring from being used in clinical environments. METHODS: Towards interpretability, this work proposes a sequence-to-sequence sleep-staging model, namely SleepTransformer. It is based on the transformer backbone and offers interpretability of the model's decisions at both the epoch and sequence level. We further propose a simple yet efficient method to quantify uncertainty in the model's decisions. The method, which is based on entropy, can serve as a metric for deferring low-confidence epochs to a human expert for further inspection. RESULTS: To make the transformer's self-attention scores interpretable, at the epoch level the attention scores are encoded as a heat map to highlight sleep-relevant features captured from the input EEG signal. At the sequence level, the attention scores are visualized as the influence of different neighboring epochs in an input sequence (i.e. the context) on recognition of a target epoch, mimicking the way manual scoring is done by human experts. CONCLUSION: Additionally, we demonstrate that SleepTransformer performs on par with existing methods on two databases of different sizes. SIGNIFICANCE: Equipped with interpretability and the ability to quantify uncertainty, SleepTransformer holds promise for being integrated into clinical settings.}, doi = {10.1109/TBME.2022.3147187}, eissn = {1558-2531}, keyword = {Electroencephalography}, keyword = {Humans}, keyword = {Polysomnography}, keyword = {Sleep}, keyword = {Sleep Stages}, keyword = {Uncertainty}, language = {eng}, publicationstatus = {published} }
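The entropy-based uncertainty measure mentioned in the abstract can be illustrated in a few lines of PyTorch: compute the normalized entropy of each epoch's stage posterior and defer epochs whose entropy exceeds a threshold to a human expert. This is a generic sketch of the idea with an arbitrary threshold, not the paper's calibrated procedure.

    # Hedged sketch of entropy-based confidence for deferring low-confidence sleep epochs.
    import torch

    def normalized_entropy(probs, eps=1e-12):
        # probs: (num_epochs, num_stages) softmax outputs, one row per 30-s sleep epoch.
        ent = -(probs * (probs + eps).log()).sum(dim=-1)
        return ent / torch.log(torch.tensor(float(probs.shape[-1])))  # scaled to [0, 1]

    def defer_mask(probs, threshold=0.5):
        # True where a prediction should be deferred to a human expert (threshold is arbitrary).
        return normalized_entropy(probs) > threshold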
@article{heremansfromstaging, author = {Heremans, ERM and Phan, H and Borzée, P and Buyse, B and Testelmans, D and De Vos, M}, journal = {Journal of Neural Engineering}, publisher = {IOP Publishing}, title = {From unsupervised to semi-supervised adversarial domain adaptation in EEG-based sleep staging.}, url = {https://www.ncbi.nlm.nih.gov/pubmed/35508121}, year = {}, abstract = {OBJECTIVE: The recent breakthrough of wearable sleep monitoring devices results in large amounts of sleep data. However, as limited labels are available, interpreting these data requires automated sleep stage classification methods with a small need for labeled training data. Transfer learning and domain adaptation offer possible solutions by enabling models to learn on a source dataset and adapt to a target dataset. APPROACH: In this paper, we investigate adversarial domain adaptation applied to real use cases with wearable sleep datasets acquired from diseased patient populations. Different practical aspects of the adversarial domain adaptation framework are examined, including the added value of (pseudo-)labels from the target dataset and the influence of domain mismatch between the source and target data. The method is also implemented for personalization to specific patients. MAIN RESULTS: The results show that adversarial domain adaptation is effective in the application of sleep staging on wearable data. When compared to a model applied on a target dataset without any adaptation, the domain adaptation method in its simplest form achieves relative gains of 7\%-27\% in accuracy. The performance on the target domain is further boosted by adding pseudo-labels and real target domain labels when available, and by choosing an appropriate source dataset. Furthermore, unsupervised adversarial domain adaptation can also personalize a model, improving the performance by 1\%-2\% compared to a non-personal model. SIGNIFICANCE: In conclusion, adversarial domain adaptation provides a flexible framework for semi-supervised and unsupervised transfer learning. This is particularly useful in sleep staging and other wearable EEG applications.}, doi = {10.1088/1741-2552/ac6ca8}, issn = {1741-2552}, eissn = {1741-2552}, keyword = {deep learning}, keyword = {domain adaptation}, keyword = {electroencephalography}, keyword = {sleep stage classification}, keyword = {transfer learning}, language = {eng}, publicationstatus = {accepted} }
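For readers unfamiliar with adversarial domain adaptation, the sketch below shows a standard DANN-style training step in PyTorch: a gradient-reversal layer feeds shared features to a domain classifier so that the feature extractor learns domain-invariant representations, while a stage classifier is trained on labeled source data. It is a generic illustration of the family of methods the paper investigates, not the authors' exact model; feat_net, stage_head and domain_head are assumed user-supplied modules.

    # Hedged sketch of a DANN-style adversarial domain adaptation step (generic, not the paper's model).
    import torch
    import torch.nn as nn

    class GradReverse(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, lam):
            ctx.lam = lam
            return x.view_as(x)

        @staticmethod
        def backward(ctx, grad_output):
            return -ctx.lam * grad_output, None                  # reverse the gradient for the features

    def domain_adversarial_step(feat_net, stage_head, domain_head, src_x, src_y, tgt_x, lam=1.0):
        # feat_net, stage_head, domain_head: assumed user-supplied nn.Modules.
        ce = nn.CrossEntropyLoss()
        f_src, f_tgt = feat_net(src_x), feat_net(tgt_x)
        cls_loss = ce(stage_head(f_src), src_y)                  # sleep-stage loss on labeled source data
        feats = torch.cat([f_src, f_tgt])
        dom_y = torch.cat([torch.zeros(len(f_src)), torch.ones(len(f_tgt))]).long()
        dom_loss = ce(domain_head(GradReverse.apply(feats, lam)), dom_y)
        return cls_loss + dom_loss                               # reversed gradient pushes domain-invariant features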
@article{phan2022pediatricmethods, author = {Phan, H and Mertins, A and Baumert, M}, journal = {IEEE Trans Biomed Eng}, month = {May}, organization = {United States}, title = {Pediatric Automatic Sleep Staging: A comparative study of state-of-the-art deep learning methods.}, url = {https://www.ncbi.nlm.nih.gov/pubmed/35552153}, volume = {PP}, year = {2022}, abstract = {Despite the tremendous progress recently made towards automatic sleep staging in adults, it is currently unknown if the most advanced algorithms generalize to the pediatric population, which displays distinctive characteristics in overnight polysomnography (PSG). To answer the question, in this work, we conduct a large-scale comparative study on the state-of-the-art deep learning methods for pediatric automatic sleep staging. Six different deep neural networks with diverging features are adopted to evaluate a sample of more than 1,200 children across a wide spectrum of obstructive sleep apnea (OSA) severity. Our experimental results show that the individual performance of automated pediatric sleep stagers when evaluated on new subjects is equivalent to the expert-level one reported on adults. Combining the six stagers into ensemble models further boosts the staging accuracy, reaching an overall accuracy of 88.8\%, a Cohen's kappa of 0.852, and a macro F1-score of 85.8\%. At the same time, the ensemble models lead to reduced predictive uncertainty. The results also show that the studied algorithms and their ensembles are robust to concept drift when the training and test data were recorded seven months apart and after clinical intervention. However, we show that the improvements in the staging performance are not necessarily clinically significant although the ensemble models lead to more favorable clinical measures than the six standalone models. Detailed analyses further demonstrate "almost perfect" agreement of the automatic stagers with one another and their similar patterns on the staging errors, suggesting little room for improvement.}, doi = {10.1109/TBME.2022.3174680}, eissn = {1558-2531}, language = {eng}, day = {12}, publicationstatus = {online-published} }
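The ensembling reported above can be approximated in its simplest form by averaging the stage posteriors of the individual models, as in the short, illustrative sketch below; the paper's exact ensembling and uncertainty analysis may differ.

    # Hedged sketch of a probability-averaging ensemble over several automatic sleep stagers.
    import numpy as np

    def ensemble_predict(prob_list):
        # prob_list: list of (num_epochs, num_stages) posterior arrays, one per staging model.
        mean_probs = np.mean(np.stack(prob_list), axis=0)
        return mean_probs.argmax(axis=-1), mean_probs            # hard labels and averaged posteriors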
@article{phan2022automaticdirections, author = {Phan, H and Mikkelsen, K}, journal = {Physiol Meas}, month = {Apr}, number = {4}, organization = {England}, title = {Automatic sleep staging of EEG signals: recent development, challenges, and future directions.}, url = {https://www.ncbi.nlm.nih.gov/pubmed/35320788}, volume = {43}, year = {2022}, abstract = {Modern deep learning holds great potential to transform clinical studies of human sleep. Teaching a machine to carry out routine tasks would be a tremendous reduction in workload for clinicians. Sleep staging, a fundamental step in sleep practice, is a suitable task for this and will be the focus in this article. Recently, automatic sleep-staging systems have been trained to mimic manual scoring, leading to similar performance to human sleep experts, at least on scoring of healthy subjects. Despite tremendous progress, we have not seen automatic sleep scoring adopted widely in clinical environments. This review aims to provide the shared view of the authors on the most recent state-of-the-art developments in automatic sleep staging, the challenges that still need to be addressed, and the future directions needed for automatic sleep scoring to achieve clinical value.}, doi = {10.1088/1361-6579/ac6049}, eissn = {1361-6579}, keyword = {EEG}, keyword = {automatic sleep staging}, keyword = {deep learning}, keyword = {deep neural networks}, keyword = {sleep monitoring}, keyword = {sleep scoring}, keyword = {Electroencephalography}, keyword = {Humans}, keyword = {Polysomnography}, keyword = {Sleep}, keyword = {Sleep Stages}, language = {eng}, day = {28}, publicationstatus = {online-published} }
@article{mikkelsen2022sleepconfigurations, author = {Mikkelsen, KB and Phan, H and Rank, ML and Hemmsen, MC and de Vos, M and Kidmose, P}, journal = {IEEE Trans Biomed Eng}, month = {May}, number = {5}, organization = {United States}, pages = {1564--1572}, title = {Sleep Monitoring Using Ear-Centered Setups: Investigating the Influence From Electrode Configurations.}, url = {https://www.ncbi.nlm.nih.gov/pubmed/34587000}, volume = {69}, year = {2022}, abstract = {Modern sleep monitoring development is shifting towards the use of unobtrusive sensors combined with algorithms for automatic sleep scoring. Many different combinations of wet and dry electrodes, ear-centered, forehead-mounted or headband-inspired designs have been proposed, alongside an ever-growing variety of machine learning algorithms for automatic sleep scoring. OBJECTIVE: Among candidate positions, those in the facial area and around the ears have the benefit of being relatively hairless, and in our view deserve extra attention. In this paper, we seek to determine the limits to sleep monitoring quality within this spatial constraint. METHODS: We compare 13 different, realistic sensor setups derived from the same data set and analysed with the same pipeline. RESULTS: All setups which include both a lateral and an EOG derivation show similar, state-of-the-art performance, with average Cohen's kappa values of at least 0.80. CONCLUSION: If large electrode distances are used, positioning is not critical for achieving a large sleep-related signal-to-noise ratio, and hence accurate sleep scoring. SIGNIFICANCE: We argue that with the current competitive performance of automated staging approaches, there is a need for establishing an improved benchmark beyond current single human rater scoring.}, doi = {10.1109/TBME.2021.3116274}, eissn = {1558-2531}, keyword = {Algorithms}, keyword = {Electrodes}, keyword = {Electroencephalography}, keyword = {Humans}, keyword = {Polysomnography}, keyword = {Sleep}, keyword = {Sleep Stages}, language = {eng}, publicationstatus = {published} }
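Cohen's kappa, the agreement metric quoted above, corrects raw per-epoch agreement for the agreement expected by chance. The small, self-contained NumPy sketch below illustrates the computation; it is equivalent to standard library implementations and is included only for reference.

    # Hedged sketch: Cohen's kappa between automatic and manual per-epoch hypnograms.
    import numpy as np

    def cohens_kappa(a, b):
        # a, b: per-epoch stage labels (e.g. W, N1, N2, N3, REM) from two scorers.
        a, b = np.asarray(a), np.asarray(b)
        p_o = np.mean(a == b)                                    # observed agreement
        p_e = sum(np.mean(a == l) * np.mean(b == l)              # agreement expected by chance
                  for l in np.union1d(a, b))
        return (p_o - p_e) / (1.0 - p_e)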
@inproceedings{nguyen2022salsalitearrays, author = {Nguyen, TNT and Jones, DL and Watcharasupat, KN and Phan, H and Gan, WS}, conference = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings}, month = {Jan}, pages = {716--720}, title = {SALSA-Lite: A Fast and Effective Feature for Polyphonic Sound Event Localization and Detection with Microphone Arrays}, volume = {2022-May}, year = {2022}, abstract = {Polyphonic sound event localization and detection (SELD) has many practical applications in acoustic sensing and monitoring. However, the development of real-time SELD has been limited by the demanding computational requirement of most recent SELD systems. In this work, we introduce SALSA-Lite, a fast and effective feature for polyphonic SELD using microphone array inputs. SALSA-Lite is a lightweight variation of a previously proposed SALSA feature for polyphonic SELD. SALSA, which stands for Spatial Cue-Augmented Log-Spectrogram, consists of multichannel log-spectrograms stacked channelwise with the normalized principal eigenvectors of the spectrotemporally corresponding spatial covariance matrices. In contrast to SALSA, which uses eigenvector-based spatial features, SALSA-Lite uses normalized inter-channel phase differences as spatial features, allowing a 30-fold speedup compared to the original SALSA feature. Experimental results on the TAU-NIGENS Spatial Sound Events 2021 dataset showed that the SALSA-Lite feature achieved competitive performance compared to the full SALSA feature, and significantly outperformed the traditional feature set of multichannel log-mel spectrograms with generalized cross-correlation spectra. Specifically, using SALSA-Lite features increased localization-dependent F1 score and class-dependent localization recall by 15\% and 5\%, respectively, compared to using multichannel log-mel spectrograms with generalized cross-correlation spectra.}, doi = {10.1109/ICASSP43922.2022.9746132}, isbn = {9781665405409}, issn = {1520-6149}, day = {1}, publicationstatus = {published} }
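A rough sketch of how SALSA-Lite-style input features can be assembled is given below: multichannel log-power spectrograms are stacked with frequency-normalized inter-channel phase differences taken against a reference microphone. The normalization constant and parameter defaults here are illustrative assumptions; consult the paper for the exact feature definition and any clipping details.

    # Hedged sketch of SALSA-Lite-style features: multichannel log spectrograms stacked with
    # frequency-normalized inter-channel phase differences (normalization is illustrative).
    import numpy as np
    from scipy.signal import stft

    def salsa_lite_features(audio, fs=24000, n_fft=512, speed_of_sound=343.0):
        # audio: (num_mics, num_samples) microphone-array signal.
        freqs, _, spec = stft(audio, fs=fs, nperseg=n_fft)       # spec: (num_mics, F, T), complex
        log_spec = np.log(np.abs(spec) ** 2 + 1e-10)             # multichannel log power spectrograms
        ipd = np.angle(spec[1:] * np.conj(spec[0])[None])        # phase differences vs. reference mic
        freqs = np.maximum(freqs, freqs[1])[None, :, None]       # avoid dividing by the 0 Hz bin
        nipd = -speed_of_sound * ipd / (2.0 * np.pi * freqs)     # frequency-normalized phase differences
        return np.concatenate([log_spec, nipd], axis=0)          # (2 * num_mics - 1, F, T)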
@inproceedings{phan2022polyphonicproblem, author = {Phan, H and Nguyen, TNT and Koch, P and Mertins, A}, conference = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings}, month = {Jan}, pages = {8877--8881}, title = {Polyphonic Audio Event Detection: Multi-Label or Multi-Class Multi-Task Classification Problem?}, volume = {2022-May}, year = {2022}, abstract = {Polyphonic events are the main error source of audio event detection (AED) systems. In the deep-learning context, the most common approach to deal with event overlaps is to treat the AED task as a multi-label classification problem. By doing this, we inherently consider multiple one-vs.-rest classification problems, which are jointly solved by a single (i.e. shared) network. In this work, to better handle polyphonic mixtures, we propose to frame the task as a multi-class classification problem by considering each possible label combination as one class. To circumvent the combinatorial explosion in the number of resulting classes, we divide the event categories into multiple groups and construct a multi-task problem in a divide-and-conquer fashion, where each of the tasks is a multi-class classification problem. A network architecture is then devised for multi-class multi-task modelling. The network is composed of a backbone subnet and multiple task-specific subnets. The task-specific subnets are designed to learn time-frequency and channel attention masks to extract features for the task at hand from the common feature maps learned by the backbone. Experiments on the TUT-SED-Synthetic-2016 dataset, which has a high degree of event overlap, show that the proposed approach results in more favorable performance than the common multi-label approach.}, doi = {10.1109/ICASSP43922.2022.9746402}, isbn = {9781665405409}, issn = {1520-6149}, day = {1}, publicationstatus = {published} }
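The divide-and-conquer label encoding described above can be illustrated with a small sketch: event categories are split into groups, and within each group every combination of simultaneously active events is mapped to a single class index, turning one multi-label problem into several multi-class tasks. The grouping and event names below are hypothetical.

    # Hedged sketch of the divide-and-conquer label encoding: each group of event categories
    # becomes one multi-class task whose classes are the possible combinations of active events.
    from itertools import combinations

    def build_group_classes(group):
        # group: list of event names handled by one task, e.g. ["speech", "dog", "siren"] (hypothetical).
        combos = [frozenset(c) for r in range(len(group) + 1) for c in combinations(group, r)]
        return {combo: idx for idx, combo in enumerate(combos)}  # event combination -> class index

    def encode_frame(active_events, groups):
        # active_events: set of events present in a frame; groups: list of event-name lists.
        targets = []
        for group in groups:
            mapping = build_group_classes(group)
            targets.append(mapping[frozenset(e for e in active_events if e in group)])
        return targets                                           # one multi-class target per task

    # Example: two groups of three events each give two 8-way tasks instead of one 6-label multi-label head.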
This file was generated by bibtex2html 1.96.