2019 |
@inproceedings{10.1007/978-3-030-30215-3_19,
  title     = {{L(a)ying} in {(Test)Bed}: How Biased Datasets Produce Impractical Results for Actual Malware Families' Classification},
  author    = {Beppler, Tamy and Botacin, Marcus and Ceschin, Fabr{\'i}cio and Oliveira, Luiz E. S. and Gr{\'e}gio, Andr{\'e}},
  editor    = {Lin, Zhiqiang and Papamanthou, Charalampos and Polychronakis, Michalis},
  booktitle = {Information Security},
  pages     = {381--401},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2019},
  isbn      = {978-3-030-30215-3},
  doi       = {10.1007/978-3-030-30215-3_19},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-30215-3_19},
  urlpdf    = {https://secret.inf.ufpr.br//papers/malware_textures_tamy.pdf},
  abstract  = {The number of malware variants released daily turned manual analysis into an impractical task. Although potentially faster, automated analysis techniques (e.g., static and dynamic) have shortcomings that are exploited by malware authors to thwart each of them, i.e., prevent malicious software from being detected or classified accordingly. Researchers then invested in traditional machine learning algorithms to try to produce efficient, effective classification methods. The produced models are also prone to errors and attacks. Novel representations of the ``subject'' were proposed to overcome previous limitations, such as malware textures. In this paper, our initial proposal was to evaluate the application of texture analysis for malware classification using samples collected in-the-wild in order to compare them with state-of-the-art results. During our tests, we discovered that texture analysis may be unfeasible for the task at hand, if we use the same malware representation employed by other authors. Furthermore, we also discovered that naive premises associated to the selection of samples in the datasets caused the introduction of biases that, in the end, produced unreal results. Finally, our tests with a broader unfiltered dataset show that texture analysis may be impractical for correct malware classification in a real world scenario, in which there is a great variety of families and some of them make use of quite sophisticate obfuscation techniques.},
  keywords  = {learning (artificial intelligence)},
}
2018 |
@article{8636415,
  title    = {The Need for Speed: An Analysis of {Brazilian} Malware Classifiers},
  author   = {Ceschin, Fabr{\'i}cio and Pinage, Felipe and Castilho, Marcos and Menotti, David and Oliveira, Luis S. and Gr{\'e}gio, Andr{\'e}},
  journal  = {IEEE Security \& Privacy},
  volume   = {16},
  number   = {6},
  pages    = {31--41},
  year     = {2018},
  month    = nov,
  issn     = {1540-7993},
  doi      = {10.1109/MSEC.2018.2875369},
  url      = {https://secret.inf.ufpr.br/papers/fabricio_needforspeed.pdf},
  abstract = {Using a dataset containing about 50,000 samples from Brazilian cyberspace, we show that relying solely on conventional machine-learning systems without taking into account the change of the subject's concept decreases the performance of classification, emphasizing the need to update the decision model immediately after concept drift occurs.},
  keywords = {Brazilian malware classifers, Feature extraction, invasive software, learning (artificial intelligence), Machine learning, machine-learning systems, malware, malware classification, pattern classification, security, Security of data, Support vector machines},
}
2019 |
L(a)ying in (Test)Bed: How Biased Datasets Produce Impractical Results for Actual Malware Families’ Classification Inproceedings Lin, Zhiqiang; Papamanthou, Charalampos; Polychronakis, Michalis (Ed.): Information Security, pp. 381–401, Springer International Publishing, Cham, 2019, ISBN: 978-3-030-30215-3. |
2018 |
The Need for Speed: An Analysis of Brazilian Malware Classifiers Journal Article IEEE Security Privacy, 16 (6), pp. 31-41, 2018, ISSN: 1540-7993. |