@article{10.1145/3617897, title = {Machine Learning (In) Security: A Stream of Problems}, author = {Fabrício Ceschin and Marcus Botacin and Albert Bifet and Bernhard Pfahringer and Luiz S Oliveira and Heitor Murilo Gomes and André Grégio}, url = {https://doi.org/10.1145/3617897 https://secret.inf.ufpr.br/papers/fabricio_mlinsec_dtrap.pdf}, doi = {10.1145/3617897}, issn = {2692-1626}, year = {2023}, date = {2023-09-01}, journal = {Digital Threats}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, abstract = {Machine Learning (ML) has been widely applied to cybersecurity and is considered state-of-the-art for solving many of the open issues in that field. However, it is very difficult to evaluate how good the produced solutions are, since the challenges faced in security may not appear in other areas. One of these challenges is the concept drift, which increases the existing arms race between attackers and defenders: malicious actors can always create novel threats to overcome the defense solutions, which may not consider them in some approaches. Due to this, it is essential to know how to properly build and evaluate an ML-based security solution. In this paper, we identify, detail, and discuss the main challenges in the correct application of ML techniques to cybersecurity data. We evaluate how concept drift, evolution, delayed labels, and adversarial ML impact the existing solutions. Moreover, we address how issues related to data collection affect the quality of the results presented in the security literature, showing that new strategies are needed to improve current solutions. 
Finally, we present how existing solutions may fail under certain circumstances, and propose mitigations to them, presenting a novel checklist to help the development of future ML solutions for cybersecurity.}, note = {Just Accepted}, keywords = {cybersecurity, Data streams, Machine learning}, pubstate = {published}, tppubtype = {article} } @inproceedings{10.1007/978-981-99-5177-2_1, title = {People Still Care About Facts: Twitter Users Engage More with Factual Discourse than Misinformation}, author = {Luiz Giovanini and Shlok Gilda and Mirela Silva and Fabrício Ceschin and Prakash Shrestha and Christopher Brant and Juliana Fernandes and Catia S Silva and André Grégio and Daniela Oliveira}, url = {https://doi.org/10.1007/978-981-99-5177-2_1 https://secret.inf.ufpr.br/papers/People%20Still%20Care%20About%20Facts.pdf}, isbn = {978-981-99-5177-2}, year = {2023}, date = {2023-08-03}, booktitle = {Security and Privacy in Social Networks and Big Data}, pages = {3--22}, publisher = {Springer Nature Singapore}, address = {Singapore}, abstract = {Misinformation entails disseminating falsehoods that lead to society's slow fracturing via decreased trust in democratic processes, institutions, and science. The public has grown aware of the role of social media as a superspreader of untrustworthy information, where even pandemics have not been immune. In this paper, we focus on COVID-19 misinformation and examine a subset of 2.1M tweets to understand misinformation as a function of engagement, tweet content (COVID-19- vs. non-COVID-19-related), and veracity (misleading or factual). Using correlation analysis, we show the most relevant feature subsets among over 126 features that most heavily correlate with misinformation or facts. 
We found that (i) factual tweets, regardless of whether COVID-related, were more engaging than misinformation tweets; and (ii) features that most heavily correlated with engagement varied depending on the veracity and content of the tweet.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{10.1145/3587471, title = {ANDROIDGYNY: Reviewing Clustering Techniques for Android Malware Family Classification}, author = {Thalita Scharr Rodrigues Pimenta and Fabricio Ceschin and Andre Gregio}, url = {https://doi.org/10.1145/3587471 https://secret.inf.ufpr.br/papers/androidgyny_thalita.pdf}, doi = {10.1145/3587471}, issn = {2692-1626}, year = {2023}, date = {2023-03-01}, journal = {Digital Threats}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, abstract = {Thousands of malicious applications (apps) are daily created, modified with the aid of automation tools, and released on the World Wide Web. Several techniques have been applied over the years to identify whether an APK is malicious or not. The use of these techniques intends to identify unknown malware mainly by calculating the similarity of a sample with previously grouped, already known families of malicious apps. Thus, high rates of accuracy would enable several countermeasures: from further quick detection to the development of vaccines and aid for reverse engineering new variants. However, most of the literature consists of limited experiments—either short-term and offline or based exclusively on well-known malicious apps’ families. In this paper, we explore the use of malware phylogeny, a term borrowed from biology, consisting of the genealogical study of the relationship between elements and families. 
Also, we investigate the literature on clustering techniques applied to mobile malware classification and discuss how researchers have been setting up their experiments.}, note = {Just Accepted}, keywords = {Classification, Mobile Malware, Phylogeny}, pubstate = {published}, tppubtype = {article} } @article{CESCHIN2022118590, title = {Fast & Furious: On the modelling of malware detection as an evolving data stream}, author = {Fabrício Ceschin and Marcus Botacin and Heitor Murilo Gomes and Felipe Pinagé and Luiz S Oliveira and André Grégio}, url = {https://www.sciencedirect.com/science/article/pii/S0957417422016463 https://secret.inf.ufpr.br/papers/fabricio_eswa_22.pdf}, doi = {https://doi.org/10.1016/j.eswa.2022.118590}, issn = {0957-4174}, year = {2022}, date = {2022-08-22}, journal = {Expert Systems with Applications}, pages = {118590}, abstract = {Malware is a major threat to computer systems and imposes many challenges to cyber security. Targeted threats, such as ransomware, cause millions of dollars in losses every year. The constant increase of malware infections has been motivating popular antiviruses (AVs) to develop dedicated detection strategies, which include meticulously crafted machine learning (ML) pipelines. However, malware developers unceasingly change their samples’ features to bypass detection. This constant evolution of malware samples causes changes to the data distribution (i.e., concept drifts) that directly affect ML model detection rates, something not considered in the majority of the literature work. In this work, we evaluate the impact of concept drift on malware classifiers for two Android datasets: DREBIN (≈130K apps) and a subset of AndroZoo (≈285K apps). We used these datasets to train an Adaptive Random Forest (ARF) classifier, as well as a Stochastic Gradient Descent (SGD) classifier. 
We also ordered all datasets samples using their VirusTotal submission timestamp and then extracted features from their textual attributes using two algorithms (Word2Vec and TF-IDF). Then, we conducted experiments comparing both feature extractors, classifiers, as well as four drift detectors (Drift Detection Method, Early Drift Detection Method, ADaptive WINdowing, and Kolmogorov–Smirnov WINdowing) to determine the best approach for real environments. Finally, we compare some possible approaches to mitigate concept drift and propose a novel data stream pipeline that updates both the classifier and the feature extractor. To do so, we conducted a longitudinal evaluation by (i) classifying malware samples collected over nine years (2009–2018), (ii) reviewing concept drift detection algorithms to attest its pervasiveness, (iii) comparing distinct ML approaches to mitigate the issue, and (iv) proposing an ML data stream pipeline that outperformed literature approaches, achieving an improvement of 22.05 percentage points of F1Score in the DREBIN dataset, and 8.77 in the AndroZoo dataset.}, keywords = {Android, Concept drift, Data streams, Machine learning, malware detection}, pubstate = {published}, tppubtype = {article} } @article{9786768, title = {Online Binary Models are Promising for Distinguishing Temporally Consistent Computer Usage Profiles}, author = {Luiz Giovanini and Fabrício Ceschin and Mirela Silva and Aokun Chen and Ramchandra Kulkarni and Sanjay Banda and Madison Lysaght and Heng Qiao and Nikolaos Sapountzis and Ruimin Sun and Brandon Matthews and Dapeng Oliver Wu and André Grégio and Daniela Oliveira}, url = {https://secret.inf.ufpr.br/papers/UsageProfiles_IEEE_TBIOM_2021.pdf}, doi = {10.1109/TBIOM.2022.3179206}, year = {2022}, date = {2022-06-03}, journal = {IEEE Transactions on Biometrics, Behavior, and Identity Science}, pages = {1-1}, keywords = {}, pubstate = {published}, tppubtype = {article} } @article{10.1145/3494535, title = { Terminator: A 
Secure Coprocessor to Accelerate Real-Time AntiViruses Using Inspection Breakpoints}, author = {Marcus Botacin and Francis B Moreira and Philippe O A Navaux and André Grégio and Marco A Z Alves}, url = {https://doi.org/10.1145/3494535 https://secret.inf.ufpr.br/papers/marcus_coproc.pdf}, doi = {10.1145/3494535}, issn = {2471-2566}, year = {2022}, date = {2022-03-01}, journal = {ACM Trans. Priv. Secur.}, volume = {25}, number = {2}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, abstract = {AntiViruses (AVs) are essential to face the myriad of malware threatening Internet users. AVs operate in two modes: on-demand checks and real-time verification. Software-based real-time AVs intercept system and function calls to execute AV’s inspection routines, resulting in significant performance penalties as the monitoring code runs among the suspicious code. Simultaneously, dark silicon problems push the industry to add more specialized accelerators inside the processor to mitigate these integration problems. In this article, we propose Terminator, an AV-specific coprocessor to assist software AVs by outsourcing their matching procedures to the hardware, thus saving CPU cycles and mitigating performance degradation. We designed Terminator   to be flexible and compatible with existing AVs by using YARA and ClamAVrules. Our experiments show that our approach can save up to 70 million CPU cycles per rule when outsourcing on-demand checks for matching typical, unmodified YARA rules against a dataset of 30 thousand in-the-wild malware samples. Our proposal eliminates the AV’s need for blocking the CPU to perform full system checks, which can now occur in parallel. We also designed a new inspection breakpoint mechanism that signals to the coprocessor the beginning of a monitored region, allowing it to scan the regions in parallel with their execution. 
Overall, our mechanism mitigated up to 44% of the overhead imposed to execute and monitor the SPEC benchmark applications in the most challenging scenario.}, keywords = {antivirus, coprocessor, malware}, pubstate = {published}, tppubtype = {article} } @article{BOTACIN2022117083, title = {HEAVEN: A Hardware-Enhanced AntiVirus ENgine to accelerate real-time, signature-based malware detection}, author = {Marcus Botacin and Marco Zanata Alves and Daniela Oliveira and André Grégio}, url = {https://www.sciencedirect.com/science/article/pii/S0957417422004882 https://secret.inf.ufpr.br/papers/marcus_heaven.pdf}, doi = {https://doi.org/10.1016/j.eswa.2022.117083}, issn = {0957-4174}, year = {2022}, date = {2022-01-01}, journal = {Expert Systems with Applications}, pages = {117083}, abstract = {Antiviruses (AVs) are computing-intensive applications that rely on constant monitoring of OS events and on applying pattern matching procedures on binaries to detect malware. In this paper, we introduce HEAVEN, a framework for Intel x86/x86-64 and MS Windows that combines hardware and software to improve AVs performance. HEAVEN workflow consists of a hardware-assisted signature matching process as its first step (triage), which is fast, and only invokes the software-based AV when the software is suspicious, i.e., with an unknown hardware signature for malignity. We implement a PoC for HEAVEN by instrumenting Intel’s x86/x86-64 branch predictor, which allows for the generation of malware signatures based on branch pattern history. To validate our PoC, we evaluate HEAVEN with a dataset composed of 10,000 malware and 1,000 benign software samples from different categories and accomplished malware detection rates of 100% (no false-positives). The detection occurred before the execution of 10% of the samples’ code. HEAVEN is designed to be memory efficient: it identified unique 32-bit signatures for each sample at the storage cost of only 35KB of SRAM. 
HEAVEN is also designed with processing efficiency in mind: its hardware extensions present negligible performance overhead and reduces the average workload of the chosen software AV counterpart (ClamWin)—10% for CPU usage, 5.61% for memory throughput, 16.22% for disk writes, and 20.22% for disk reads. With HEAVEN, we may decrease the number of CPU cycles used for malware scanning by 87.5%, which is a promising result regarding the feasibility of our proposal: the combination of hardware-/software-based AVs for practical and effective malware detection that flags suspicious software while posing negligible performance overhead.}, keywords = {antivirus, Branch prediction, malware, Performance, Signatures}, pubstate = {published}, tppubtype = {article} } @inproceedings{10.1007/978-3-031-22390-7_22, title = {Why We Need a Theory of Maliciousness: Hardware Performance Counters in Security}, author = {Marcus Botacin and André Grégio}, editor = {Willy Susilo and Xiaofeng Chen and Fuchun Guo and Yudi Zhang and Rolly Intan}, url = {https://link.springer.com/chapter/10.1007/978-3-031-22390-7_22 secret.inf.ufpr.br/papers/isc_hpc_short.pdf https://secret.inf.ufpr.br/papers/isc_hpc_extended.pdf}, isbn = {978-3-031-22390-7}, year = {2022}, date = {2022-01-01}, booktitle = {Information Security}, pages = {381--389}, publisher = {Springer International Publishing}, address = {Cham}, abstract = {Hardware Performance Counters (HPCs) are at the center of a research discussion: Is their use effective for malware detection? In this paper, we try to clarify the discussion by evaluating prior work presenting HPC criticism and highlighting their implicit assumptions and the potential research opportunities created by them. We discovered that HPCs are particularly good at detecting malware that exploits architectural side-effects, but not as good as traditional detection approaches at detecting pure-software malware, such that detection approaches must be combined. 
We also identified that most of the controversy about HPCs originates from researchers not clearly stating which type of malware they were considering. Thus, we claim the need for a theory of maliciousness to better state malware threats and evaluate proposed defenses.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @inproceedings{10.1007/978-3-031-22390-7_20, title = {Dissecting Applications Uninstallers and Removers: Are They Effective?}, author = {Marcus Botacin and André Grégio}, editor = {Willy Susilo and Xiaofeng Chen and Fuchun Guo and Yudi Zhang and Rolly Intan}, url = {https://link.springer.com/chapter/10.1007/978-3-031-22390-7_20 https://secret.inf.ufpr.br/papers/isc_uninstallers.pdf}, isbn = {978-3-031-22390-7}, year = {2022}, date = {2022-01-01}, booktitle = {Information Security}, pages = {339--359}, publisher = {Springer International Publishing}, address = {Cham}, abstract = {Developing a safe application is so important as to properly install it in a system, and not an application's tampered version. In a similar note, developers should properly care about applications' uninstall process to avoid leaving traces of sensitive data behind in the system or interfere with the remaining applications. Until now, the academic literature has paid little attention to uninstall procedures so far. Moreover, a whole ecosystem of application uninstallers has been created, making multiple uninstallers available in software repositories. A key point is to understand how these applications work so as to develop stronger systems. To this end, we present a landscape work evaluating the operation of the 11 most downloaded uninstaller applications from the three most popular Internet software repositories. We discovered that most of these applications are not very different from the native Windows uninstaller. 
Although evaluated uninstallers present a more organized User Interface, thus enhancing usability, they are only able to find the same installed application as the native Windows uninstaller, but not broken installations. Few uninstallers apply heuristics to find broken application installations. However, we show that those heuristics can be abused by attackers to remove third applications. Finally, we also show that none of the removers is resistant to malicious uninstallers that terminate the remover process.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{10.1145/3429741, title = {One Size Does Not Fit All: A Longitudinal Analysis of Brazilian Financial Malware}, author = {Marcus Botacin and Hojjat Aghakhani and Stefano Ortolani and Christopher Kruegel and Giovanni Vigna and Daniela Oliveira and Paulo Lício De Geus and André Grégio}, url = {https://doi.org/10.1145/3429741 https://secret.inf.ufpr.br/papers/marcus_tops_br.pdf}, doi = {10.1145/3429741}, issn = {2471-2566}, year = {2021}, date = {2021-01-01}, journal = {ACM Trans. Priv. Secur.}, volume = {24}, number = {2}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, abstract = {Malware analysis is an essential task to understand infection campaigns, the behavior of malicious codes, and possible ways to mitigate threats. Malware analysis also allows better assessment of attackers’ capabilities, techniques, and processes. Although a substantial amount of previous work provided a comprehensive analysis of the international malware ecosystem, research on regionalized, country-, and population-specific malware campaigns have been scarce. Moving towards addressing this gap, we conducted a longitudinal (2012-2020) and comprehensive (encompassing an entire population of online banking users) study of MS Windows desktop malware that actually infected Brazilian banks’ users. 
We found that the Brazilian financial desktop malware has been evolving quickly: it started to make use of a variety of file formats instead of typical PE binaries, relied on native system resources, and abused obfuscation techniques to bypass detection mechanisms. Our study on the threats targeting a significant population on the ecosystem of the largest and most populous country in Latin America can provide invaluable insights that may be applied to other countries’ user populations, especially those in the developing world that might face cultural peculiarities similar to Brazil’s. With this evaluation, we expect to motivate the security community/industry to seriously consider a deeper level of customization during the development of next-generation anti-malware solutions, as well as to raise awareness towards regionalized and targeted Internet threats.}, keywords = {banking, malware, reverse engineer}, pubstate = {published}, tppubtype = {article} } @article{BOTACIN2021102287, title = {Challenges and Pitfalls in Malware Research}, author = {Marcus Botacin and Fabricio Ceschin and Ruimin Sun and Daniela Oliveira and André Grégio}, url = {https://www.sciencedirect.com/science/article/pii/S0167404821001115 https://secret.inf.ufpr.br/papers/marcus_challenges.pdf}, doi = {https://doi.org/10.1016/j.cose.2021.102287}, issn = {0167-4048}, year = {2021}, date = {2021-01-01}, journal = {Computers & Security}, pages = {102287}, abstract = {As the malware research field became more established over the last two decades, new research questions arose, such as how to make malware research reproducible, how to bring scientific rigor to attack papers, or what is an appropriate malware dataset for relevant experimental results. The challenges these questions pose also brings pitfalls that affect the multiple malware research stakeholders. 
To help answering those questions and to highlight potential research pitfalls to be avoided, in this paper, we present a systematic literature review of 491 papers on malware research published in major security conferences between 2000 and 2018. We identified the most common pitfalls present in past literature and propose a method for assessing current (and future) malware research. Our goal is towards integrating science and engineering best practices to develop further, improved research by learning from issues present in the published body of work. As far as we know, this is the largest literature review of its kind and the first to summarize research pitfalls in a research methodology that avoids them. In total, we discovered 20 pitfalls that limit current research impact and reproducibility. The identified pitfalls range from (i) the lack of a proper threat model, that complicates paper’s evaluation, to (ii) the use of closed-source solutions and private datasets, that limit reproducibility. We also report yet-to-be-overcome challenges that are inherent to the malware nature, such as non-deterministic analysis results. Based on our findings, we propose a set of actions to be taken by the malware research and development community for future work: (i) Consolidation of malware research as a field constituted of diverse research approaches (e.g., engineering solutions, offensive research, landscapes/observational studies, and network traffic/system traces analysis); (ii) design of engineering solutions with clearer, direct assumptions (e.g., positioning solutions as proofs-of-concept vs. 
deployable); (iii) design of experiments that reflects (and emphasizes) the target scenario for the proposed solution (e.g., corporation, user, country-wide); (iv) clearer exposition and discussion of limitations of used technologies and exercised norms/standards for research (e.g., the use of closed-source antiviruses as ground-truth).}, keywords = {}, pubstate = {published}, tppubtype = {article} } @article{BOTACIN2021301220, title = {Understanding uses and misuses of similarity hashing functions for malware detection and family clustering in actual scenarios}, author = {Marcus Botacin and Vitor Hugo Galhardo Moia and Fabricio Ceschin and Marco A Amaral Henriques and André Grégio}, url = {https://www.sciencedirect.com/science/article/pii/S2666281721001281 https://secret.inf.ufpr.br/papers/marcus_similarity_hashing.pdf}, doi = {https://doi.org/10.1016/j.fsidi.2021.301220}, issn = {2666-2817}, year = {2021}, date = {2021-01-01}, journal = {Forensic Science International: Digital Investigation}, volume = {38}, pages = {301220}, abstract = {An everyday growing number of malware variants target end-users and organizations. To reduce the amount of individual malware handling, security analysts apply techniques for finding similarities to cluster samples. A popular clustering method relies on similarity hashing functions, which create short representations of files and compare them to produce a score related to the similarity level between them. Despite the popularity of those functions, the limits of their application to malware samples have not been extensively studied so-far. To help in bridging this gap, we performed a set of experiments to characterize the application of these functions on long-term, realistic malware analysis scenarios. To do so, we introduce SHAVE, an ideal model of similarity hashing-based antivirus engine. 
The evaluation of SHAVE consisted of applying two distinct hash functions (ssdeep and sdhash) to a dataset of 21 thousand actual malware samples collected over four years. We characterized this dataset based on the performed clustering, and discovered that: (i) smaller groups are prevalent than large ones; (ii) the threshold value chosen may significantly change the conclusions about the prevalence of similar samples in a given dataset; (iii) establishing a ground-truth for similarity hashing functions comparison has its issues, since the clusters originated from traditional AV labeling routines may result from a completely distinct approach; (iv) the application of similarity hashing functions improves traditional AVs’ detection rates by up to 40%; and finally (v) taking specific binary regions into account (e.g., instructions), leads to better classification results than hashing the entire binary file.}, keywords = {}, pubstate = {published}, tppubtype = {article} } @article{BOTACIN2021102500, title = {AntiViruses under the Microscope: A Hands-On Perspective}, author = {Marcus Botacin and Felipe Duarte Domingues and Fabrício Ceschin and Raphael Machnicki and Marco Antonio Zanata Alves and Paulo Lício de Geus and André Grégio}, url = {https://www.sciencedirect.com/science/article/pii/S0167404821003242 https://secret.inf.ufpr.br/papers/marcus_av_handson.pdf}, doi = {https://doi.org/10.1016/j.cose.2021.102500}, issn = {0167-4048}, year = {2021}, date = {2021-01-01}, journal = {Computers & Security}, pages = {102500}, abstract = {AntiViruses (AVs) are the main defense line against attacks for most users and much research has been done about them, especially proposing new detection procedures that work in academic prototypes. 
However, as most current and commercial AVs are closed-source solutions, in practice, little is known about their real internals: information such as what is a typical AV database size, the detection methods effectively used in each operation mode, and how often on average the AVs are updated are still unknown. This prevents research work from meeting the industrial practices more thoroughly. To fill this gap, in this work, we systematize the knowledge about AVs. To do so, we first surveyed the literature and identified existing knowledge gaps in AV internals’ working. Further, we bridged these gaps by analyzing popular (Windows, Linux, and Android) AV solutions to check their operations in practice. Our methodology encompassed multiple techniques, from tracing to fuzzing. We detail current AV’s architecture, including their multiple components, such as browser extensions and injected libraries, regarding their implementation, monitoring features, and self-protection capabilities. We discovered, for instance, a great disparity in the set of API functions hooked by the distinct AV’s libraries, which might have a significant impact in the viability of academically-proposed detection models (e.g., machine learning-based ones).}, keywords = {}, pubstate = {published}, tppubtype = {article} } @inproceedings{10.1145/3433667.3433669, title = {No Need to Teach New Tricks to Old Malware: Winning an Evasion Challenge with XOR-Based Adversarial Samples}, author = {Fabricio Ceschin and Marcus Botacin and Gabriel Lüders and Heitor Murilo Gomes and Luiz Oliveira and Andre Gregio}, url = {https://doi.org/10.1145/3433667.3433669 https://secret.inf.ufpr.br/papers/roots_mlsec20.pdf}, doi = {10.1145/3433667.3433669}, isbn = {9781450389747}, year = {2020}, date = {2020-11-01}, booktitle = {Reversing and Offensive-Oriented Trends Symposium}, pages = {13–22}, publisher = {Association for Computing Machinery}, address = {Vienna, Austria}, series = {ROOTS'20}, abstract = {Adversarial 
attacks to Machine Learning (ML) models became such a concern that tech companies (Microsoft and CUJO AI’s Vulnerability Research Lab) decided to launch contests to better understand their impact on practice. During the contest’s first edition (2019), participating teams were challenged to bypass three ML models in a white box manner. Our team bypassed all the three of them and reported interesting insights about models’ weaknesses. In the second edition (2020), the challenge evolved to an attack-and-defense model: the teams should either propose defensive models and attack other teams’ models in a black box manner. Despite the difficulty increase, our team was able to bypass all models again. In this paper, we describe our insights for this year’s contest regarding on attacking models, as well defending them from adversarial attacks. In particular, we show how frequency-based models (e.g., TF-IDF) are vulnerable to the addition of dead function imports, and how models based on raw bytes are vulnerable to payload-embedding obfuscation (e.g., XOR and base64 encoding).}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } @article{BOTACIN2020101859, title = {We Need to Talk About AntiViruses: Challenges & Pitfalls of AV Evaluations}, author = {Marcus Botacin and Fabricio Ceschin and Paulo de Geus and André Grégio}, url = {http://www.sciencedirect.com/science/article/pii/S0167404820301310 https://secret.inf.ufpr.br/papers/marcus_av.pdf}, doi = {https://doi.org/10.1016/j.cose.2020.101859}, issn = {0167-4048}, year = {2020}, date = {2020-04-29}, journal = {Computers & Security}, pages = {101859}, abstract = {Security evaluation is an essential task to identify the level of protection accomplished in running systems or to aid in choosing better solutions for each specific scenario. 
Although antiviruses (AVs) are one of the main defensive solutions for most end-users and corporations, AV’s evaluations are conducted by few organizations and often limited to compare detection rates. Moreover, other important factors of AVs’ operating mode (e.g., response time and detection regression) are usually underestimated. Ignoring such factors create an “understanding gap” on the effectiveness of AVs in actual scenarios, which we aim to bridge by presenting a broader characterization of current AVs’ modes of operation. In our characterization, we consider distinct file types, operating systems, datasets, and time frames. To do so, we daily collected samples from two distinct, representative malware sources and submitted them to the VirusTotal (VT) service for 30 consecutive days. In total, we considered 28,875 unique malware samples. For each day, we retrieved the submitted samples’ detection rates and assigned labels, resulting in more than 1M distinct VT submissions overall. Our experimental results show that: (i) phishing contexts are a challenge for all AVs, turning malicious Web pages detectors less effective than malicious files detectors; (ii) generic procedures are insufficient to ensure broad detection coverage, incurring in lower detection rates for particular datasets (e.g., country-specific) than for those with world-wide collected samples; (iii) detection rates are unstable since all AVs presented detection regression effects after scans in different time frames using the same dataset and (iv) AVs’ long response times in delivering new signatures/heuristics create a significant attack opportunity window within the first 30 days after we first identified a malicious binary. To address the effects of our findings, we propose six new metrics to evaluate the multiple aspects that impact the effectiveness of AVs. 
With them, we hope to assess corporate (and domestic) users to better evaluate the solutions that fit their needs more adequately.}, keywords = {}, pubstate = {published}, tppubtype = {article} } @article{Botacin2020, title = {Leveraging branch traces to understand kernel internals from within}, author = {Marcus Botacin and Paulo Lício de Geus and André Grégio}, url = {https://doi.org/10.1007/s11416-019-00343-w https://secret.inf.ufpr.br//papers/reverse_kernel_marcus.pdf}, doi = {10.1007/s11416-019-00343-w}, issn = {2263-8733}, year = {2020}, date = {2020-01-02}, journal = {Journal of Computer Virology and Hacking Techniques}, abstract = {Kernel monitoring is often a hard task, requiring external debuggers and/or modules to be successfully performed. These requirements make analysis procedures more complicated because multiple machines, although virtualized ones, are required. This requirements also make analysis procedures more expensive. In this paper, we present the Lightweight Kernel Tracer (LKT), an alternative solution for tracing kernel from within by leveraging branch monitors for data collection and an address-based introspection procedure for context reconstruction. We evaluated LKT by tracing distinct machines powered by x64 Windows kernels and show that LKT may be used for understanding kernel's internals (e.g., graphics and USB subsystems) and for system profiling. 
We also show how to use LKT to trace other tracing and monitoring mechanisms running in kernel, such as Antiviruses and Sandboxes.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

@article{Botacin2020b,
  title     = {The self modifying code (SMC)-aware processor (SAP): a security look on architectural impact and support},
  author    = {Marcus Botacin and Marco Zanata and André Grégio},
  url       = {https://doi.org/10.1007/s11416-020-00348-w https://secret.inf.ufpr.br/papers/SMC_marcus.pdf},
  doi       = {10.1007/s11416-020-00348-w},
  issn      = {2263-8733},
  year      = {2020},
  date      = {2020-01-01},
  journal   = {Journal of Computer Virology and Hacking Techniques},
  abstract  = {Self modifying code (SMC) are code snippets that modify themselves at runtime. Malware use SMC to hide payloads and achieve persistence. Software-based SMC detection solutions impose performance penalties for real-time monitoring and do not benefit from runtime architectural information (cache invalidation or pipeline flush, for instance). We revisit SMC impact on hardware internals and discuss the implementation of an SMC detector at distinct architectural points. We consider three detection approaches: (i) existing hardware counters; (ii) block invalidation by the cache coherence protocol; (iii) the use of Memory Management Unit (MMU) information to control SMC execution. We compare the identified instrumentation points to highlight their strong and weak points.
We also compare them to previous SMC detectors' implementations.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

% NOTE(review): pages 1--1 with no volume/number looks like an early-access
% placeholder; confirm against the publisher record and update when known.
@article{9061034,
  title     = {A Praise for Defensive Programming: Leveraging Uncertainty for Effective Malware Mitigation},
  author    = {R Sun and M Botacin and N Sapountzis and X Yuan and M Bishop and D E Porter and X Li and A Grégio and D Oliveira},
  url       = {https://ieeexplore.ieee.org/document/9061034 https://secret.inf.ufpr.br/papers/chameleon.pdf},
  year      = {2020},
  date      = {2020-01-01},
  journal   = {IEEE Transactions on Dependable and Secure Computing},
  pages     = {1--1},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

% The tilde accent in the second author's surname needs the backslash form
% {\~a}; a bare tilde inside braces typesets as a space followed by "a".
@conference{10.1007/978-3-030-52683-2_10b,
  title     = {On the Security of Application Installers and Online Software Repositories},
  author    = {Marcus Botacin and Giovanni Bert{\~a}o and Paulo de Geus and André Grégio and Christopher Kruegel and Giovanni Vigna},
  editor    = {Clémentine Maurice and Leyla Bilge and Gianluca Stringhini and Nuno Neves},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-52683-2_10 https://secret.inf.ufpr.br/papers/marcus_dimva_bundle.pdf},
  isbn      = {978-3-030-52683-2},
  year      = {2020},
  date      = {2020-01-01},
  booktitle = {Detection of Intrusions and Malware, and Vulnerability Assessment},
  pages     = {192--214},
  publisher = {Springer International Publishing},
  address   = {Cham},
  abstract  = {The security of application installers is often overlooked, but the security risks associated to these pieces of code are not negligible. Online public repositories have been one of the most popular ways for end users to obtain software, but there is a lack of systematic security evaluation of popular public repositories. In this paper, we bridge this gap by analyzing five popular software repositories. We focus on their software updating dynamics, as well as the presence of traces of vulnerable and/or trojanized applications among the top-100 most downloaded Windows programs on each of the evaluated repositories.
We analyzed 2,935 unique programs collected in a period of 144 consecutive days. Our results show that: (i) the repositories frequently exhibit rank changes due to applications fast climbing toward the first positions; (ii) the repositories often update their payloads, which may cause the distribution of distinct binaries for the same intended application (binaries for the same applications may also be different in each repository); (iii) the installers are composed by multiple components and often download payloads from the Internet to complete their installation steps, posing new risks for users (we demonstrate that some installers are vulnerable to content tampering through man-in-the-middle attacks); (iv) the ever-changing nature of repositories and installers makes them prone to abuse, as we observed that 30% of all applications were reported malicious by at least one AV.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}

% The ampersand in the title is escaped: a bare one makes LaTeX stop with a
% misplaced alignment-tab error when the title is typeset.
@inproceedings{10.1145/3422575.3422775,
  title     = {Near-Memory \& In-Memory Detection of Fileless Malware},
  author    = {Marcus Botacin and André Grégio and Marco Antonio Zanata Alves},
  url       = {https://doi.org/10.1145/3422575.3422775 https://secret.inf.ufpr.br/papers/marcus_fileless.pdf},
  doi       = {10.1145/3422575.3422775},
  isbn      = {9781450388993},
  year      = {2020},
  date      = {2020-01-01},
  booktitle = {The International Symposium on Memory Systems},
  pages     = {23--38},
  publisher = {Association for Computing Machinery},
  address   = {Washington, DC, USA},
  series    = {MEMSYS 2020},
  abstract  = {Fileless malware are recent threats to computer systems that load directly into memory, and whose aim is to prevent anti-viruses (AVs) from successfully matching byte patterns against suspicious files written on disk. Their detection requires that software-based AVs continuously scan memory, which is expensive due to repeated locks and polls.
However, research advances introduced near-memory and in-memory processing, which allow memory controllers to trigger basic computations without moving data to the CPU. In this paper, we address AVs performance overhead by moving them to the hardware, i.e., we propose instrumenting processors’ memory controller or smart memories (near- and in-memory malware detection, respectively) to accelerate memory scanning procedures. To do so, we present MINI-ME, the Malware Identification based on Near- and In-Memory Evaluation mechanism, a hardware-based AV accelerator that interrupts the program’s execution if malicious patterns are discovered in their memory. We prototyped MINI-ME in a simulator and tested it with a set of 21 thousand in-the-wild malware samples, which resulted in multiple signatures matching with less than 1% of performance overhead and rates of 100% detection, and zero false-positives and false-negatives.},
  keywords  = {antivirus, malware, pattern matching, processing in memory},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{10.1145/3375894.3375895,
  title     = {RevEngE is a Dish Served Cold: Debug-Oriented Malware Decompilation and Reassembly},
  author    = {Marcus Botacin and Lucas Galante and Paulo de Geus and André Grégio},
  url       = {https://doi.org/10.1145/3375894.3375895 https://secret.inf.ufpr.br/papers/roots_revenge.pdf},
  doi       = {10.1145/3375894.3375895},
  isbn      = {9781450377751},
  year      = {2019},
  date      = {2019-11-28},
  booktitle = {Proceedings of the 3rd Reversing and Offensive-Oriented Trends Symposium},
  publisher = {Association for Computing Machinery},
  address   = {Vienna, Austria},
  series    = {ROOTS’19},
  abstract  = {Malware analysis is key for cybersecurity overall improvement. Analysis tools have been evolving from complete static analyzers to decompilers. Malware decompilation allows for code inspection at higher abstraction levels, easing incident response. However, the decompilation procedure has many challenges, such as opaque constructions, irreversible mappings, semantic gap bridging, among others. In this paper, we propose a new approach that leverages the human analyst expertise to overcome decompilation challenges. We name this approach "DoD---debug-oriented decompilation", in which the analyst is able to reverse engineer the malware sample on his own and to instruct the decompiler to translate selected code portions (e.g., decision branches, fingerprinting functions, payloads etc.) into high level code. With DoD, the analyst might group all decompiled pieces into new code to be analyzed by other tool, or to develop a novel malware sample from previous pieces of code and thus exercise a Proof-of-Concept (PoC). To validate our approach, we propose RevEngE, the Reverse Engineering Engine for malware decompilation and reassembly, a set of GDB extensions that intercept and introspect into executed functions to build an Intermediate Representation (IR) in real-time, enabling any-time decompilation. We evaluate RevEngE with x86 ELF binaries collected from VirusShare, and show that a new malware sample created from the decompilation of independent functions of five known malware samples is considered "clean" by all VirusTotal's AVs.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@inproceedings{10.1145/3375894.3375898,
  title     = {Shallow Security: On the Creation of Adversarial Variants to Evade Machine Learning-Based Malware Detectors},
  author    = {Fabrício Ceschin and Marcus Botacin and Heitor Murilo Gomes and Luiz S Oliveira and André Grégio},
  url       = {https://doi.org/10.1145/3375894.3375898 https://secret.inf.ufpr.br/papers/roots_shallow.pdf},
  doi       = {10.1145/3375894.3375898},
  isbn      = {9781450377751},
  year      = {2019},
  date      = {2019-11-28},
  booktitle = {Proceedings of the 3rd Reversing and Offensive-Oriented Trends Symposium},
  publisher = {Association for Computing Machinery},
  address   = {Vienna, Austria},
  series    = {ROOTS’19},
  abstract  = {The use of Machine Learning (ML) techniques for malware detection has been a trend in the last two decades. More recently, researchers started to investigate adversarial approaches to bypass these ML-based malware detectors. Adversarial attacks became so popular that a large Internet company has launched a public challenge to encourage researchers to bypass their (three) ML-based static malware detectors. Our research group teamed to participate in this challenge in August/2019, accomplishing the bypass of all 150 tests proposed by the company. To do so, we implemented an automatic exploitation method which moves the original malware binary sections to resources and includes new chunks of data to it to create adversarial samples that not only bypassed their ML detectors, but also real AV engines as well (with a lower detection rate than the original samples). In this paper, we detail our methodological approach to overcome the challenge and report our findings.
With these results, we expect to contribute with the community and provide better understanding on ML-based detectors weaknesses. We also pinpoint future research directions toward the development of more robust malware detectors against adversarial machine learning.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

@article{Botacin2019,
  title     = {``VANILLA'' malware: vanishing antiviruses by interleaving layers and layers of attacks},
  author    = {Marcus Botacin and Paulo Lício de Geus and André Grégio},
  url       = {https://secret.inf.ufpr.br/papers/marcus-vanilla.pdf https://doi.org/10.1007/s11416-019-00333-y},
  doi       = {10.1007/s11416-019-00333-y},
  issn      = {2263-8733},
  year      = {2019},
  date      = {2019-06-11},
  journal   = {Journal of Computer Virology and Hacking Techniques},
  abstract  = {Malware are persistent threats to any networked systems. Recent years increase in multi-core, distributed systems created new opportunities for malware authors to exploit such capabilities. In particular, the distributed execution of a malware in multiple cores may be used to evade currently widespread single-core-based detectors (e.g., antiviruses, or AVs) and malware analysis solutions that are unable to correlate data from multiple sources. In this paper, we propose a technique for distributing the malware functions in several distinct ``vanilla'' processes to show that AVs can be easily evaded. Therefore, our technique allows malware to interleave of layers of attacks to remain undetected by current AVs. Our goal is to expose a real menace and to discuss it so as to provide insights for the development of better AVs. We discuss the role of distributed and multicore-based malware in current and future threat scenarios with practical examples that we specially crafted for testing (e.g., a distributed sample synchronized via cache side channels).
We (i) review multi-threaded/processed implementation issues (from kernel and userland) and present a multi-core-based monitoring solution; (ii) present strategies for code distribution, exemplified via DLL injectors, and discuss their weak and strong points; and (iii) evaluate how real security solutions perform when exposed to distributed malware. We converted real, serial malware to parallel code and showed that current AVs are not fully able to detect multi-core malware.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

% Eight authors, each pair separated by " and "; the lab-hosted PDF is the
% second link in url, and the venue is recorded once, in booktitle.
@conference{recosoc,
  title     = {The AV says: Your hardware definitions were updated!},
  author    = {Marcus Botacin and Lucas Galante and Fabrício Ceschin and Luigi Carro and Paulo Cesar Santos and Paulo Lício de Geus and André Grégio and Marco Zanata},
  url       = {https://ieeexplore.ieee.org/document/9034972 https://secret.inf.ufpr.br/papers/marcus_recosoc.pdf},
  doi       = {10.1109/ReCoSoC48741.2019.9034972},
  isbn      = {978-1-7281-4770-3},
  year      = {2019},
  date      = {2019-01-01},
  booktitle = {14th International Symposium on Reconfigurable Communication-centric Systems-on-Chip (ReCoSoC 2019)},
  publisher = {IEEE},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}

@inproceedings{Botacin:2019:IBI:3339252.3340103,
  title     = {The Internet Banking [in]Security Spiral: Past, Present, and Future of Online Banking Protection Mechanisms Based on a Brazilian Case Study},
  author    = {Marcus Botacin and Anatoli Kalysch and André Grégio},
  url       = {http://doi.acm.org/10.1145/3339252.3340103 https://secret.inf.ufpr.br/papers/marcus_banks.pdf},
  doi       = {10.1145/3339252.3340103},
  isbn      = {978-1-4503-7164-3},
  year      = {2019},
  date      = {2019-01-01},
  booktitle = {Proceedings of the 14th International Conference on Availability, Reliability and Security},
  pages     = {49:1--49:10},
  publisher =
{ACM},
  address   = {Canterbury, United Kingdom},
  series    = {ARES '19},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

% url holds two space-separated links: the publisher chapter page and the
% lab-hosted PDF.
@inproceedings{10.1007/978-3-030-30215-3_19,
  title     = {L(a)ying in (Test)Bed: How Biased Datasets Produce Impractical Results for Actual Malware Families’ Classification},
  author    = {Tamy Beppler and Marcus Botacin and Fabrício Ceschin and Luiz E S Oliveira and André Grégio},
  editor    = {Zhiqiang Lin and Charalampos Papamanthou and Michalis Polychronakis},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-30215-3_19 https://secret.inf.ufpr.br/papers/malware_textures_tamy.pdf},
  isbn      = {978-3-030-30215-3},
  year      = {2019},
  date      = {2019-01-01},
  booktitle = {Information Security},
  pages     = {381--401},
  publisher = {Springer International Publishing},
  address   = {Cham},
  abstract  = {The number of malware variants released daily turned manual analysis into an impractical task. Although potentially faster, automated analysis techniques (e.g., static and dynamic) have shortcomings that are exploited by malware authors to thwart each of them, i.e., prevent malicious software from being detected or classified accordingly. Researchers then invested in traditional machine learning algorithms to try to produce efficient, effective classification methods. The produced models are also prone to errors and attacks. Novel representations of the ``subject'' were proposed to overcome previous limitations, such as malware textures. In this paper, our initial proposal was to evaluate the application of texture analysis for malware classification using samples collected in-the-wild in order to compare them with state-of-the-art results. During our tests, we discovered that texture analysis may be unfeasible for the task at hand, if we use the same malware representation employed by other authors.
Furthermore, we also discovered that naive premises associated to the selection of samples in the datasets caused the introduction of biases that, in the end, produced unreal results. Finally, our tests with a broader unfiltered dataset show that texture analysis may be impractical for correct malware classification in a real world scenario, in which there is a great variety of families and some of them make use of quite sophisticate obfuscation techniques.},
  keywords  = {learning (artificial intelligence)},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

% The journal name contains a literal ampersand, escaped for LaTeX.
@article{8636415,
  title     = {The Need for Speed: An Analysis of Brazilian Malware Classifiers},
  author    = {Fabrício Ceschin and Felipe Pinage and Marcos Castilho and David Menotti and Luiz S Oliveira and André Grégio},
  url       = {https://secret.inf.ufpr.br/papers/fabricio_needforspeed.pdf},
  doi       = {10.1109/MSEC.2018.2875369},
  issn      = {1540-7993},
  year      = {2018},
  date      = {2018-11-01},
  journal   = {IEEE Security \& Privacy},
  volume    = {16},
  number    = {6},
  pages     = {31--41},
  abstract  = {Using a dataset containing about 50,000 samples from Brazilian cyberspace, we show that relying solely on conventional machine-learning systems without taking into account the change of the subject's concept decreases the performance of classification, emphasizing the need to update the decision model immediately after concept drift occurs.},
  keywords  = {Brazilian malware classifiers, Feature extraction, invasive software, learning (artificial intelligence), Machine learning, machine-learning systems, malware, malware classification, pattern classification, security, Security of data, Support vector machines},
  pubstate  = {published},
  tppubtype = {article}
}

@article{Botacin2018,
  title     = {The other guys: automated analysis of marginalized malware},
  author    = {Marcus Botacin and Paulo Lício de Geus and André Grégio},
  url       = {https://secret.inf.ufpr.br/papers/behemot.pdf https://doi.org/10.1007/s11416-017-0292-8},
  doi       = {10.1007/s11416-017-0292-8},
  issn      =
{2263-8733},
  year      = {2018},
  date      = {2018-02-01},
  journal   = {Journal of Computer Virology and Hacking Techniques},
  volume    = {14},
  number    = {1},
  pages     = {87--98},
  abstract  = {In order to thwart dynamic analysis and bypass protection mechanisms, malware have been using several file formats and evasive techniques. While publicly available dynamic malware analysis systems are one of the main sources of information for researchers, security analysts and incident response professionals, they are unable to cope with all types of threats. Therefore, it is difficult to gather information from public systems about CPL, .NET/Mono, 64-bits, reboot-dependent, or malware targeting systems newer than Windows XP, which result in a lack of understanding about how current malware behave during infections on modern operating systems. In this paper, we discuss the challenges and issues faced during the development of this type of analysis system, mainly due to security features available in NT 6.x kernel versions of Windows OS. We also introduce a dynamic analysis system that addresses the aforementioned types of malware as well as present results obtained from their analyses.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

% The particle "de" is lowercase so BibTeX parses it as the von part of
% "de Geus" rather than as an extra given name.
@article{Botacin:2018:WWS:3236632.3199673,
  title     = {Who Watches the Watchmen: A Security-focused Review on Current State-of-the-art Techniques, Tools, and Methods for Systems and Binary Analysis on Modern Platforms},
  author    = {Marcus Botacin and Paulo Lício de Geus and André Grégio},
  url       = {https://secret.inf.ufpr.br/papers/marcus-survey.pdf http://doi.acm.org/10.1145/3199673},
  doi       = {10.1145/3199673},
  issn      = {0360-0300},
  year      = {2018},
  date      = {2018-01-01},
  journal   = {ACM Comput.
Surv.},
  volume    = {51},
  number    = {4},
  pages     = {69:1--69:34},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {Binary analysis, HVM, introspection, malware, security, SMM},
  pubstate  = {published},
  tppubtype = {article}
}

% The particle "de" is lowercase so BibTeX parses it as the von part of
% "de Geus" rather than as an extra given name.
@article{Botacin:2018:EBM:3171591.3152162,
  title     = {Enhancing Branch Monitoring for Security Purposes: From Control Flow Integrity to Malware Analysis and Debugging},
  author    = {Marcus Botacin and Paulo Lício de Geus and André Grégio},
  url       = {https://secret.inf.ufpr.br/papers/marcus-branch.pdf http://doi.acm.org/10.1145/3152162},
  doi       = {10.1145/3152162},
  issn      = {2471-2566},
  year      = {2018},
  date      = {2018-01-01},
  journal   = {ACM Trans. Priv. Secur.},
  volume    = {21},
  number    = {1},
  pages     = {4:1--4:30},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {branch monitor, debug, malware, ROP},
  pubstate  = {published},
  tppubtype = {article}
}

@inproceedings{10.1007/978-3-319-99136-8_3,
  title     = {Lumus: Dynamically Uncovering Evasive Android Applications},
  author    = {Vitor Afonso and Anatoli Kalysch and Tilo Müller and Daniela Oliveira and André Grégio and Paulo Lício de Geus},
  editor    = {Liqun Chen and Mark Manulis and Steve Schneider},
  url       = {https://secret.inf.ufpr.br/papers/lumus.pdf},
  isbn      = {978-3-319-99136-8},
  year      = {2018},
  date      = {2018-01-01},
  booktitle = {Information Security},
  pages     = {47--66},
  publisher = {Springer International Publishing},
  address   = {Cham},
  abstract  = {Dynamic analysis of Android malware suffers from techniques that identify the analysis environment and prevent the malicious behavior from being observed. While there are many analysis solutions that can thwart evasive malware on Windows, the application of similar techniques for Android has not been studied in-depth. In this paper, we present Lumus, a novel technique to uncover evasive malware on Android. Lumus compares the execution traces of malware on bare metal and emulated environments.
We used Lumus to analyze 1,470 Android malware samples and were able to uncover 192 evasive samples. Comparing our approach with other solutions yields better results in terms of accuracy and false positives. We discuss which information are typically used by evasive malware for detecting emulated environments, and conclude on how analysis sandboxes can be strengthened in the future.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

% Page ranges are written with the BibTeX double hyphen so styles typeset an
% en dash.
@inproceedings{8073803,
  title     = {The dose makes the poison — Leveraging uncertainty for effective malware detection},
  author    = {R Sun and X Yuan and A Lee and M Bishop and D E Porter and X Li and André Grégio and Daniela Oliveira},
  doi       = {10.1109/DESEC.2017.8073803},
  year      = {2017},
  date      = {2017-08-01},
  booktitle = {2017 IEEE Conference on Dependable and Secure Computing},
  pages     = {123--130},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}