2023
|
Pimenta, Thalita Scharr Rodrigues; Ceschin, Fabricio; Gregio, Andre ANDROIDGYNY: Reviewing Clustering Techniques for Android Malware Family Classification Journal Article Digital Threats, 2023, ISSN: 2692-1626, (Just Accepted). Abstract | Links | BibTeX @article{10.1145/3587471,
title = {{ANDROIDGYNY}: Reviewing Clustering Techniques for {Android} Malware Family Classification},
author = {Thalita Scharr Rodrigues Pimenta and Fabricio Ceschin and Andre Gregio},
url = {https://doi.org/10.1145/3587471
https://secret.inf.ufpr.br/papers/androidgyny_thalita.pdf},
doi = {10.1145/3587471},
issn = {2692-1626},
year = {2023},
date = {2023-03-01},
journal = {Digital Threats},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
abstract = {Thousands of malicious applications (apps) are daily created, modified with the aid of automation tools, and released on the World Wide Web. Several techniques have been applied over the years to identify whether an APK is malicious or not. The use of these techniques intends to identify unknown malware mainly by calculating the similarity of a sample with previously grouped, already known families of malicious apps. Thus, high rates of accuracy would enable several countermeasures: from further quick detection to the development of vaccines and aid for reverse engineering new variants. However, most of the literature consists of limited experiments—either short-term and offline or based exclusively on well-known malicious apps’ families. In this paper, we explore the use of malware phylogeny, a term borrowed from biology, consisting of the genealogical study of the relationship between elements and families. Also, we investigate the literature on clustering techniques applied to mobile malware classification and discuss how researchers have been setting up their experiments.},
note = {Just Accepted},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Thousands of malicious applications (apps) are daily created, modified with the aid of automation tools, and released on the World Wide Web. Several techniques have been applied over the years to identify whether an APK is malicious or not. The use of these techniques intends to identify unknown malware mainly by calculating the similarity of a sample with previously grouped, already known families of malicious apps. Thus, high rates of accuracy would enable several countermeasures: from further quick detection to the development of vaccines and aid for reverse engineering new variants. However, most of the literature consists of limited experiments—either short-term and offline or based exclusively on well-known malicious apps’ families. In this paper, we explore the use of malware phylogeny, a term borrowed from biology, consisting of the genealogical study of the relationship between elements and families. Also, we investigate the literature on clustering techniques applied to mobile malware classification and discuss how researchers have been setting up their experiments. |
2022
|
Ceschin, Fabrício; Botacin, Marcus; Gomes, Heitor Murilo; Pinagé, Felipe; Oliveira, Luiz S; Grégio, André Fast & Furious: On the modelling of malware detection as an evolving data stream Journal Article Expert Systems with Applications, pp. 118590, 2022, ISSN: 0957-4174. Abstract | Links | BibTeX @article{CESCHIN2022118590,
title = {Fast \& {Furious}: On the modelling of malware detection as an evolving data stream},
author = {Fabrício Ceschin and Marcus Botacin and Heitor Murilo Gomes and Felipe Pinagé and Luiz S Oliveira and André Grégio},
url = {https://www.sciencedirect.com/science/article/pii/S0957417422016463
https://secret.inf.ufpr.br/papers/fabricio_eswa_22.pdf},
doi = {10.1016/j.eswa.2022.118590},
issn = {0957-4174},
year = {2022},
date = {2022-08-22},
journal = {Expert Systems with Applications},
pages = {118590},
abstract = {Malware is a major threat to computer systems and imposes many challenges to cyber security. Targeted threats, such as ransomware, cause millions of dollars in losses every year. The constant increase of malware infections has been motivating popular antiviruses (AVs) to develop dedicated detection strategies, which include meticulously crafted machine learning (ML) pipelines. However, malware developers unceasingly change their samples’ features to bypass detection. This constant evolution of malware samples causes changes to the data distribution (i.e., concept drifts) that directly affect ML model detection rates, something not considered in the majority of the literature work. In this work, we evaluate the impact of concept drift on malware classifiers for two Android datasets: DREBIN (≈130K apps) and a subset of AndroZoo (≈285K apps). We used these datasets to train an Adaptive Random Forest (ARF) classifier, as well as a Stochastic Gradient Descent (SGD) classifier. We also ordered all datasets samples using their VirusTotal submission timestamp and then extracted features from their textual attributes using two algorithms (Word2Vec and TF-IDF). Then, we conducted experiments comparing both feature extractors, classifiers, as well as four drift detectors (Drift Detection Method, Early Drift Detection Method, ADaptive WINdowing, and Kolmogorov–Smirnov WINdowing) to determine the best approach for real environments. Finally, we compare some possible approaches to mitigate concept drift and propose a novel data stream pipeline that updates both the classifier and the feature extractor.
To do so, we conducted a longitudinal evaluation by (i) classifying malware samples collected over nine years (2009–2018), (ii) reviewing concept drift detection algorithms to attest its pervasiveness, (iii) comparing distinct ML approaches to mitigate the issue, and (iv) proposing an ML data stream pipeline that outperformed literature approaches, achieving an improvement of 22.05 percentage points of F1Score in the DREBIN dataset, and 8.77 in the AndroZoo dataset.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Malware is a major threat to computer systems and imposes many challenges to cyber security. Targeted threats, such as ransomware, cause millions of dollars in losses every year. The constant increase of malware infections has been motivating popular antiviruses (AVs) to develop dedicated detection strategies, which include meticulously crafted machine learning (ML) pipelines. However, malware developers unceasingly change their samples’ features to bypass detection. This constant evolution of malware samples causes changes to the data distribution (i.e., concept drifts) that directly affect ML model detection rates, something not considered in the majority of the literature work. In this work, we evaluate the impact of concept drift on malware classifiers for two Android datasets: DREBIN (≈130K apps) and a subset of AndroZoo (≈285K apps). We used these datasets to train an Adaptive Random Forest (ARF) classifier, as well as a Stochastic Gradient Descent (SGD) classifier. We also ordered all datasets samples using their VirusTotal submission timestamp and then extracted features from their textual attributes using two algorithms (Word2Vec and TF-IDF). Then, we conducted experiments comparing both feature extractors, classifiers, as well as four drift detectors (Drift Detection Method, Early Drift Detection Method, ADaptive WINdowing, and Kolmogorov–Smirnov WINdowing) to determine the best approach for real environments. Finally, we compare some possible approaches to mitigate concept drift and propose a novel data stream pipeline that updates both the classifier and the feature extractor. 
To do so, we conducted a longitudinal evaluation by (i) classifying malware samples collected over nine years (2009–2018), (ii) reviewing concept drift detection algorithms to attest its pervasiveness, (iii) comparing distinct ML approaches to mitigate the issue, and (iv) proposing an ML data stream pipeline that outperformed literature approaches, achieving an improvement of 22.05 percentage points of F1Score in the DREBIN dataset, and 8.77 in the AndroZoo dataset. |
Giovanini, Luiz; Ceschin, Fabrício; Silva, Mirela; Chen, Aokun; Kulkarni, Ramchandra; Banda, Sanjay; Lysaght, Madison; Qiao, Heng; Sapountzis, Nikolaos; Sun, Ruimin; Matthews, Brandon; Wu, Dapeng Oliver; Grégio, André; Oliveira, Daniela Online Binary Models are Promising for Distinguishing Temporally Consistent Computer Usage Profiles Journal Article IEEE Transactions on Biometrics, Behavior, and Identity Science, pp. 1-1, 2022. Links | BibTeX @article{9786768,
title = {Online Binary Models are Promising for Distinguishing Temporally Consistent Computer Usage Profiles},
author = {Luiz Giovanini and Fabrício Ceschin and Mirela Silva and Aokun Chen and Ramchandra Kulkarni and Sanjay Banda and Madison Lysaght and Heng Qiao and Nikolaos Sapountzis and Ruimin Sun and Brandon Matthews and Dapeng Oliver Wu and André Grégio and Daniela Oliveira},
url = {https://secret.inf.ufpr.br/papers/UsageProfiles_IEEE_TBIOM_2021.pdf},
doi = {10.1109/TBIOM.2022.3179206},
year = {2022},
date = {2022-06-03},
journal = {IEEE Transactions on Biometrics, Behavior, and Identity Science},
pages = {1--1},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
|
Botacin, Marcus; Moreira, Francis B; Navaux, Philippe O A; Grégio, André; Alves, Marco A Z Terminator: A Secure Coprocessor to Accelerate Real-Time AntiViruses Using Inspection Breakpoints Journal Article ACM Trans. Priv. Secur., 25 (2), 2022, ISSN: 2471-2566. Abstract | Links | BibTeX @article{10.1145/3494535,
title = {{Terminator}: A Secure Coprocessor to Accelerate Real-Time {AntiViruses} Using Inspection Breakpoints},
author = {Marcus Botacin and Francis B Moreira and Philippe O A Navaux and André Grégio and Marco A Z Alves},
url = {https://doi.org/10.1145/3494535
https://secret.inf.ufpr.br/papers/marcus_coproc.pdf},
doi = {10.1145/3494535},
issn = {2471-2566},
year = {2022},
date = {2022-03-01},
journal = {ACM Transactions on Privacy and Security},
volume = {25},
number = {2},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
abstract = {AntiViruses (AVs) are essential to face the myriad of malware threatening Internet users. AVs operate in two modes: on-demand checks and real-time verification. Software-based real-time AVs intercept system and function calls to execute AV’s inspection routines, resulting in significant performance penalties as the monitoring code runs among the suspicious code. Simultaneously, dark silicon problems push the industry to add more specialized accelerators inside the processor to mitigate these integration problems. In this article, we propose Terminator, an AV-specific coprocessor to assist software AVs by outsourcing their matching procedures to the hardware, thus saving CPU cycles and mitigating performance degradation. We designed Terminator to be flexible and compatible with existing AVs by using YARA and ClamAV rules. Our experiments show that our approach can save up to 70 million CPU cycles per rule when outsourcing on-demand checks for matching typical, unmodified YARA rules against a dataset of 30 thousand in-the-wild malware samples. Our proposal eliminates the AV’s need for blocking the CPU to perform full system checks, which can now occur in parallel. We also designed a new inspection breakpoint mechanism that signals to the coprocessor the beginning of a monitored region, allowing it to scan the regions in parallel with their execution. Overall, our mechanism mitigated up to 44\% of the overhead imposed to execute and monitor the SPEC benchmark applications in the most challenging scenario.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
AntiViruses (AVs) are essential to face the myriad of malware threatening Internet users. AVs operate in two modes: on-demand checks and real-time verification. Software-based real-time AVs intercept system and function calls to execute AV’s inspection routines, resulting in significant performance penalties as the monitoring code runs among the suspicious code. Simultaneously, dark silicon problems push the industry to add more specialized accelerators inside the processor to mitigate these integration problems. In this article, we propose Terminator, an AV-specific coprocessor to assist software AVs by outsourcing their matching procedures to the hardware, thus saving CPU cycles and mitigating performance degradation. We designed Terminator to be flexible and compatible with existing AVs by using YARA and ClamAV rules. Our experiments show that our approach can save up to 70 million CPU cycles per rule when outsourcing on-demand checks for matching typical, unmodified YARA rules against a dataset of 30 thousand in-the-wild malware samples. Our proposal eliminates the AV’s need for blocking the CPU to perform full system checks, which can now occur in parallel. We also designed a new inspection breakpoint mechanism that signals to the coprocessor the beginning of a monitored region, allowing it to scan the regions in parallel with their execution. Overall, our mechanism mitigated up to 44% of the overhead imposed to execute and monitor the SPEC benchmark applications in the most challenging scenario. |
Botacin, Marcus; Alves, Marco Zanata; Oliveira, Daniela; Grégio, André HEAVEN: A Hardware-Enhanced AntiVirus ENgine to accelerate real-time, signature-based malware detection Journal Article Expert Systems with Applications, pp. 117083, 2022, ISSN: 0957-4174. Abstract | Links | BibTeX @article{BOTACIN2022117083,
title = {{HEAVEN}: A {Hardware-Enhanced} {AntiVirus} {ENgine} to accelerate real-time, signature-based malware detection},
author = {Marcus Botacin and Marco Zanata Alves and Daniela Oliveira and André Grégio},
url = {https://www.sciencedirect.com/science/article/pii/S0957417422004882
https://secret.inf.ufpr.br/papers/marcus_heaven.pdf},
doi = {10.1016/j.eswa.2022.117083},
issn = {0957-4174},
year = {2022},
date = {2022-01-01},
journal = {Expert Systems with Applications},
pages = {117083},
abstract = {Antiviruses (AVs) are computing-intensive applications that rely on constant monitoring of OS events and on applying pattern matching procedures on binaries to detect malware. In this paper, we introduce HEAVEN, a framework for Intel x86/x86-64 and MS Windows that combines hardware and software to improve AVs performance. HEAVEN workflow consists of a hardware-assisted signature matching process as its first step (triage), which is fast, and only invokes the software-based AV when the software is suspicious, i.e., with an unknown hardware signature for malignity. We implement a PoC for HEAVEN by instrumenting Intel’s x86/x86-64 branch predictor, which allows for the generation of malware signatures based on branch pattern history. To validate our PoC, we evaluate HEAVEN with a dataset composed of 10,000 malware and 1,000 benign software samples from different categories and accomplished malware detection rates of 100\% (no false-positives). The detection occurred before the execution of 10\% of the samples’ code. HEAVEN is designed to be memory efficient: it identified unique 32-bit signatures for each sample at the storage cost of only 35KB of SRAM. HEAVEN is also designed with processing efficiency in mind: its hardware extensions present negligible performance overhead and reduces the average workload of the chosen software AV counterpart (ClamWin)—10\% for CPU usage, 5.61\% for memory throughput, 16.22\% for disk writes, and 20.22\% for disk reads. With HEAVEN, we may decrease the number of CPU cycles used for malware scanning by 87.5\%, which is a promising result regarding the feasibility of our proposal: the combination of hardware-/software-based AVs for practical and effective malware detection that flags suspicious software while posing negligible performance overhead.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Antiviruses (AVs) are computing-intensive applications that rely on constant monitoring of OS events and on applying pattern matching procedures on binaries to detect malware. In this paper, we introduce HEAVEN, a framework for Intel x86/x86-64 and MS Windows that combines hardware and software to improve AVs performance. HEAVEN workflow consists of a hardware-assisted signature matching process as its first step (triage), which is fast, and only invokes the software-based AV when the software is suspicious, i.e., with an unknown hardware signature for malignity. We implement a PoC for HEAVEN by instrumenting Intel’s x86/x86-64 branch predictor, which allows for the generation of malware signatures based on branch pattern history. To validate our PoC, we evaluate HEAVEN with a dataset composed of 10,000 malware and 1,000 benign software samples from different categories and accomplished malware detection rates of 100% (no false-positives). The detection occurred before the execution of 10% of the samples’ code. HEAVEN is designed to be memory efficient: it identified unique 32-bit signatures for each sample at the storage cost of only 35KB of SRAM. HEAVEN is also designed with processing efficiency in mind: its hardware extensions present negligible performance overhead and reduces the average workload of the chosen software AV counterpart (ClamWin)—10% for CPU usage, 5.61% for memory throughput, 16.22% for disk writes, and 20.22% for disk reads. With HEAVEN, we may decrease the number of CPU cycles used for malware scanning by 87.5%, which is a promising result regarding the feasibility of our proposal: the combination of hardware-/software-based AVs for practical and effective malware detection that flags suspicious software while posing negligible performance overhead. |
Botacin, Marcus; Grégio, André Why We Need a Theory of Maliciousness: Hardware Performance Counters in Security Inproceedings Susilo, Willy; Chen, Xiaofeng; Guo, Fuchun; Zhang, Yudi; Intan, Rolly (Ed.): Information Security, pp. 381–389, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-22390-7. Abstract | Links | BibTeX @inproceedings{10.1007/978-3-031-22390-7_22,
title = {Why We Need a Theory of Maliciousness: Hardware Performance Counters in Security},
author = {Marcus Botacin and André Grégio},
editor = {Willy Susilo and Xiaofeng Chen and Fuchun Guo and Yudi Zhang and Rolly Intan},
url = {https://link.springer.com/chapter/10.1007/978-3-031-22390-7_22
https://secret.inf.ufpr.br/papers/isc_hpc_short.pdf
https://secret.inf.ufpr.br/papers/isc_hpc_extended.pdf},
doi = {10.1007/978-3-031-22390-7_22},
isbn = {978-3-031-22390-7},
year = {2022},
date = {2022-01-01},
booktitle = {Information Security},
pages = {381--389},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {Hardware Performance Counters (HPCs) are at the center of a research discussion: Is their use effective for malware detection? In this paper, we try to clarify the discussion by evaluating prior work presenting HPC criticism and highlighting their implicit assumptions and the potential research opportunities created by them. We discovered that HPCs are particularly good at detecting malware that exploits architectural side-effects, but not as good as traditional detection approaches at detecting pure-software malware, such that detection approaches must be combined. We also identified that most of the controversy about HPCs originates from researchers not clearly stating which type of malware they were considering. Thus, we claim the need for a theory of maliciousness to better state malware threats and evaluate proposed defenses.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Hardware Performance Counters (HPCs) are at the center of a research discussion: Is their use effective for malware detection? In this paper, we try to clarify the discussion by evaluating prior work presenting HPC criticism and highlighting their implicit assumptions and the potential research opportunities created by them. We discovered that HPCs are particularly good at detecting malware that exploits architectural side-effects, but not as good as traditional detection approaches at detecting pure-software malware, such that detection approaches must be combined. We also identified that most of the controversy about HPCs originates from researchers not clearly stating which type of malware they were considering. Thus, we claim the need for a theory of maliciousness to better state malware threats and evaluate proposed defenses. |