% esel00.bib
%%%%% 2000 %%%% as of 7/20/00 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Publications dated 2000, one entry per work, in citation-key order.
%
% The bare names TR, SQJ, ESE, IEEECS, LIBTR and LIBESE used below are
% string macros that are not defined in this file; they must be defined
% before this file is read (presumably in a companion strings file --
% TODO confirm), otherwise BibTeX warns and substitutes empty text.
%
% Fields such as annote, inlibrary, abstract and keywords are bookkeeping
% data; the standard styles do not print them.
%
% Text outside an entry is ignored by BibTeX, so these percent-prefixed
% lines act as comments.  Do not write an at-sign in them.

%%%%% Master's theses and papers %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@mastersthesis{Berk00:MS,
  author   = {Yevgeniy Berkovich},
  title    = {Software Quality Prediction Using Case-Based Reasoning},
  school   = {Florida Atlantic University},
  address  = {Boca Raton, Florida USA},
  month    = aug,
  year     = 2000,
  note     = {Advised by Taghi M. Khoshgoftaar},
  keywords = {Software Metrics, case-based reasoning}
}

@mastersthesis{Chen00:MS,
  author   = {Ye Chen},
  title    = {Measurement of Coupling and Cohesion of Software},
  school   = {Florida Atlantic University},
  address  = {Boca Raton, Florida USA},
  month    = may,
  year     = 2000,
  note     = {Advised by Taghi M. Khoshgoftaar},
  keywords = {Software Metrics, Coupling, Cohesion, Information Theory,
              Metric Properties, Nethack}
}

@mastersthesis{Guan00:MS,
  author   = {Xin Guan},
  title    = {Cost of Misclassification in Software Quality Models},
  school   = {Florida Atlantic University},
  address  = {Boca Raton, Florida USA},
  month    = may,
  year     = 2000,
  note     = {Advised by Taghi M. Khoshgoftaar},
  keywords = {Software Metrics, Classification, {MetaCost}, Cost-Boosting,
              {Adacost}, ROC Analysis, Logistic Regression,
              Principal Components Analysis}
}

% JHKA00:SciCo -- month, pages and volume were not yet known ("In press.");
% the former empty placeholders were dropped so that no stray text is printed.
@article{JHKA00:SciCo,
  author    = {Wendell D. Jones and John P. Hudepohl and
               Taghi M. Khoshgoftaar and Edward B. Allen},
  title     = {Application of a Usage Profile in Software Quality Models},
  journal   = {Science of Computer Programming},
  year      = 2000,
  note      = {In press.},
  publisher = {Elsevier Science},
  address   = {Amsterdam, Netherlands},
  annote    = {based on JHKA99:CSMR. Conditionally accepted.},
  abstract  = {Faults discovered by customers are an important aspect of
    software quality. The working hypothesis of this paper is that variables
    derived from an execution profile can be useful in software quality
    models. An execution profile of a software system consists of the
    probability of execution of each module during operations.
    Execution represents opportunities for customers to discover faults.
    However, an execution profile over an entire customer-base can be
    difficult to measure directly. Deployment records of past releases can
    be a valuable source of data for calculating an approximation to the
    probability of execution. In this paper, we analyze a metric derived
    from deployment records which is a practical surrogate for an execution
    profile in the context of a software quality model. We define ``usage''
    as the proportion of systems in the field which have a module deployed.
    This paper presents a case study of a very large legacy
    telecommunications system. We developed models to predict whether
    software modules will have any faults discovered by customers on systems
    in the field. Static software product metrics and usage were independent
    variables. The significance levels of variables in logistic regression
    models were analyzed, and models with and without usage as an
    independent variable were compared. The case study was empirical
    evidence that usage can be a significant contributor to a software
    quality model.}
}

@inproceedings{KA00:ISACC,
  author       = {Taghi M. Khoshgoftaar and Edward B. Allen},
  title        = {Modeling the Risk of Software Faults},
  booktitle    = {Conference Proceedings: International Software Assurance
                  Certification Conference},
  organization = {Information Technology Association of America},
  address      = {Reston, Virginia USA},
  month        = sep,
  year         = 2000,
  note         = {Proceedings available from Reliable Software Technologies,
                  Sterling, VA USA. In press.},
  annote       = {based on FAU TR-CSE-00-06},
  abstract     = {Development teams apply various techniques to improve
    software reliability, such as independent verification and validation
    (IVV), reengineering, extra reviews, additional testing, and strategic
    assignment of personnel.
    Due to resource and time constraints, one must often target reliability
    enhancement activities to high-risk modules. Software quality models
    predict which modules should be targeted. This paper synthesizes
    principles for software-quality modeling. The product of a software
    quality model is predictions. For example, a model could be designed to
    predict membership in a fault-prone class for each module or predict the
    number of faults expected in each module. With predictions in hand,
    project managers can prioritize and target software enhancement
    activities toward those modules that need improvement the most.
    Cost-effective software quality models must be developed through a
    disciplined methodology that balances accuracy with practical data
    collection. Drawing on experience with data from a variety of software
    development organizations, our methodology is based on the following
    principles: (1) measure the past to predict the future; (2) exploit your
    gold mines; (3) software metrics are candidate predictors; (4) linear
    models are not enough; and (5) empirical validation must be realistic.
    This paper provides an explanation of each principle, illustrated by two
    published case studies.},
  keywords     = {Software Reliability, Faults, Fault-Prone Modules,
                  Software Metrics, Classification, Regression Models,
                  Multiple Linear Regression, Curvilinear Regression}
}

@article{KA00:TR,
  author    = {Taghi M. Khoshgoftaar and Edward B. Allen},
  title     = {A Practical Classification Rule for Software Quality Models},
  journal   = TR,
  volume    = 49,
  number    = 2,
  month     = jun,
  year      = 2000,
  note      = {In press.},
  annote    = {see KA97:FAU56},
  inlibrary = LIBTR,
  abstract  = {A ``practical'' classification rule for a software quality
    model is one that considers the needs of the project to use a model to
    guide targeting software reliability-enhancement efforts, such as extra
    reviews early in development. Such a rule will often prove more useful
    than alternative rules.
    The contribution of this paper is discussion of several classification
    rules for software quality models, and recommendation of a generalized
    classification rule, where the effectiveness and efficiency of the model
    for guiding software reliability-enhancement efforts can be explicitly
    considered. This is the first application of this rule to software
    quality modeling that we know of. Two case studies illustrate
    application of the generalized classification rule. A case study of a
    telecommunications system models membership in the class of fault-prone
    modules as a function of the number of interfaces to other modules. A
    case study of a military system models membership in the class of
    fault-prone modules as a function of a set of process metrics that
    depict the development history of a module. These case studies are
    examples where balanced misclassification rates resulted in more useful
    and practical software quality models than the other classification
    rules.},
  keywords  = {Software Reliability, Classification, Prior Probabilities,
               Costs of Misclassification, Software Metrics}
}

@article{KA00:RQSE,
  author   = {Taghi M. Khoshgoftaar and Edward B. Allen},
  title    = {Predicting Fault-Prone Software Modules in Embedded Systems
              with Classification Trees},
  journal  = {International Journal of Reliability, Quality and Safety
              Engineering},
  volume   = 7,
  number   = 4,
  month    = dec,
  year     = 2000,
  note     = {In press.},
  annote   = {See KA99:HASE},
  abstract = {Embedded-computer systems have become essential to life in
    modern society. For example, the backbone of society's information
    infrastructure is telecommunications. Embedded systems must have highly
    reliable software, so that we avoid the severe consequences of failures,
    intolerable down-time, and expensive repairs in remote locations.
    Moreover, today's fast-moving technology marketplace mandates that
    embedded systems evolve, resulting in multiple software releases
    embedded in multiple products.
    Software quality models can be valuable tools for software engineering
    of embedded systems, because some software-enhancement techniques are so
    expensive or time-consuming that it is not practical to apply them to
    all modules. Targeting such enhancement techniques is an effective way
    to reduce the likelihood of faults discovered in the field. Research has
    shown software metrics to be useful predictors of software faults. A
    software quality model is developed using measurements and fault data
    from a past release. The calibrated model is then applied to modules
    currently under development. Such models yield predictions on a
    module-by-module basis. This paper examines the Classification And
    Regression Trees (CART) algorithm for building tree-based models that
    predict which software modules have high risk of faults to be discovered
    during operations. CART is attractive because it emphasizes pruning to
    achieve robust models. This paper presents details on the CART algorithm
    in the context of software engineering of embedded systems. We
    illustrate this approach with a case study of four consecutive releases
    of software embedded in a large telecommunications system. The level of
    accuracy achieved in the case study would be useful to developers of an
    embedded system. The case study indicated that this model would continue
    to be useful over several releases as the system evolves.},
  keywords = {High Assurance, Embedded Systems, Software Reliability,
              Software Metrics, Fault-Prone Modules, Classification Trees,
              CART}
}

@article{KA00:SQJ,
  author    = {Taghi M. Khoshgoftaar and Edward B. Allen},
  title     = {Empirical Assessment of a Software Metric: {The} Information
               Content of Operators},
  journal   = SQJ,
  year      = 2000,
  note      = {In press.},
  inlibrary = LIBESE,
  abstract  = {This paper presents an empirical case study that predicted
    faults in modules based on the total information content of the
    operators.
    This metric is closely related to Harrison's Average Information Content
    Classification (AICC), which is the entropy of the operators. Most
    information theory-based metrics proposed in the literature have not
    been subjected to empirical predictive studies of real-world software
    systems. In contrast, this study shows that a simple information
    theory-based metric can be more useful for prediction of software
    quality than comparable metrics based on counts in the context of a
    commercial software development organization. Three models were
    considered, all based on operators as an abstraction of software. The
    model based on information content of the operators made more accurate
    predictions than two similar models based on the number of operators and
    the number of unique operators. The purpose of this paper is a fair
    comparison of the three metrics, rather than developing an optimal
    model. We have long advocated multivariate models for industrial use.
    The case study considered three large commercial systems, written in
    assembly language, and developed consecutively by professional
    programmers. The first system was used to estimate parameters of the
    models. The subsequent two were used to evaluate the accuracy of model
    predictions.},
  keywords  = {Software Measurement, Software Metrics, Software Quality,
               Prediction, Linear Regression, Information Theory, Entropy}
}

@article{KAJH00:ASE,
  author    = {Taghi M. Khoshgoftaar and Edward B. Allen and
               Wendell D. Jones and John P. Hudepohl},
  title     = {Accuracy of Software Quality Models Over Multiple Releases},
  journal   = {Annals of Software Engineering},
  volume    = 6,
  year      = 2000,
  note      = {In press.},
  publisher = {J.\ C.\ Baltzer},
  abstract  = {Many evolving mission-critical systems must have high
    software reliability. However, it is often difficult to identify
    fault-prone modules early enough in a development cycle to guide
    software enhancement efforts effectively and efficiently.
    Software quality models can yield timely predictions of membership in
    the fault-prone class on a module-by-module basis, enabling one to
    target enhancement techniques. However, it is an open empirical
    question, ``Can a software quality model remain useful over several
    releases?'' Most prior software quality studies have examined only one
    release of a system, evaluating the model with modules from the same
    release. We conducted a case study of a large legacy telecommunications
    system where measurements on one software release were used to build
    models, and three subsequent releases of the same system were used to
    evaluate model accuracy. This is a realistic assessment of model
    accuracy, closely simulating actual use of a software quality model. A
    module was considered fault-prone if any of its faults were discovered
    by customers. These faults are extremely expensive due to consequent
    loss of service and emergency repair efforts. We found that the model
    maintained useful accuracy over several releases. These findings are
    initial empirical evidence that software quality models can remain
    useful as a system is maintained by a stable software development
    process.},
  keywords  = {Empirical Studies, Software Maintenance, Software
               Reliability, Software Metrics, Fault-Prone Modules,
               Classification, Costs of Misclassification,
               Logistic Regression}
}

@article{KAJH00:TR,
  author    = {Taghi M. Khoshgoftaar and Edward B. Allen and
               Wendell D. Jones and John P. Hudepohl},
  title     = {Classification Tree Models of Software Quality Over Multiple
               Releases},
  journal   = TR,
  volume    = 49,
  number    = 1,
  month     = mar,
  year      = 2000,
  note      = {In press.},
  annote    = {Expanded version of KAJH99:ISSRE},
  inlibrary = LIBTR,
  abstract  = {This paper presents an empirical study that evaluated
    software quality models over several releases to address the question,
    ``How long will a model yield useful predictions?'' This paper also
    introduces the Classification And Regression Trees (CART) algorithm to
    software reliability engineering practitioners. CART is amenable to
    achieving a preferred balance between the two types of misclassification
    rates. This is desirable because misclassifications of fault-prone
    modules often have much more severe consequences than misclassifications
    of those that are not fault-prone. The case study developed two
    classification-tree models based on four consecutive releases of a very
    large legacy telecommunications system. Forty-two software product,
    process, and execution metrics were candidate predictors. The first
    model used measurements of the first release as the training data set.
    This model had eleven significant predictors out of forty-two
    candidates. The second model used measurements of the second release as
    the training data set. This model had fifteen significant predictors out
    of forty-two candidates. Measurements of subsequent releases were
    evaluation data sets. Analysis of the models' predictors yielded
    insights into various software development practices. Both models had
    accuracy that would be useful to developers. One might suppose that
    software quality models lose their value very quickly over successive
    releases due to evolution of the product and the underlying development
    processes. We found the models remained useful over all the releases
    studied.},
  keywords  = {Software Reliability, Software Metrics, Fault-Prone Modules,
               Classification Trees, CART}
}

@inproceedings{KAS00:RQD,
  author       = {Taghi M. Khoshgoftaar and Edward B. Allen and Ruqun Shan},
  editor       = {Hoang Pham and Ming-Wei Lu},
  title        = {Benefits of Principal Components Analysis with
                  Classification Trees of Fault-Prone Software Modules},
  booktitle    = {Proceedings: Sixth {ISSAT} International Conference on
                  Reliability and Quality in Design},
  organization = {International Society of Science and Applied Technologies},
  address      = {Orlando, Florida USA},
  month        = aug,
  year         = 2000,
  note         = {Invited paper. In press.},
  abstract     = {Software reliability has become a major concern in many
    software-intensive industries, because faults in source code can cause
    operational failures with serious consequences. This paper confirms
    prior work showing that classification trees can be useful to identify
    fault-prone modules based on the pattern of software metrics and
    furthermore, shows that principal components analysis can be a useful
    supporting technique. We conducted a case study of a very large
    telecommunications system. Tree models built by the Classification And
    Regression Trees (CART) algorithm illustrated the potential benefits of
    preprocessing data with principal components analysis.},
  keywords     = {Software Metrics, Fault-Prone Modules, Classification
                  Trees, CART, Principal Components Analysis}
}

@inproceedings{KAS00:ISSRE,
  author       = {Taghi M. Khoshgoftaar and Edward B. Allen and Ruqun Shan},
  title        = {Improving Tree-Based Models of Software Quality with
                  Principal Components Analysis},
  booktitle    = {Proceedings: {Eleventh} International Symposium on
                  Software Reliability Engineering},
  organization = IEEECS,
  address      = {San Jose, California USA},
  month        = oct,
  year         = 2000,
  note         = {In press.},
  abstract     = {Software-quality classification models can predict which
    modules will be considered fault-prone, or not, based on software
    product metrics, process metrics, and execution metrics. Such
    predictions can be used to target improvement efforts to those modules
    that need it the most.
    Classification-tree modeling is a robust technique for building such
    software quality models. However, model structure may be unstable and
    accuracy may suffer when predictors are highly correlated. This paper
    presents an empirical case study of four releases of a very large
    telecommunications system which showed that the tree-based models can be
    improved by transforming the predictors with principal components
    analysis, so that transformed predictors are not correlated. The case
    study used the regression-tree algorithm in the S-Plus package and then
    applied our general decision rule to classify modules.},
  keywords     = {Software Quality, Software Metrics, Fault-Prone Modules,
                  Classification Trees, S-Plus,
                  Principal Components Analysis}
}

@inproceedings{KAT00:ISSRE,
  author       = {Taghi M. Khoshgoftaar and Edward B. Allen and
                  Vishal Thaker},
  title        = {Modeling Fault-Prone Modules of Subsystems},
  booktitle    = {Proceedings: {Eleventh} International Symposium on
                  Software Reliability Engineering},
  organization = IEEECS,
  address      = {San Jose, California USA},
  month        = oct,
  year         = 2000,
  note         = {In press.},
  abstract     = {Software developers have a keen interest in knowing which
    modules are likely to have faults discovered by customers. Accurate
    predictions enable developers to avoid rework by targeting software
    enhancement activities prior to release. Many case studies in the
    literature build models to predict which modules will be fault-prone
    without regard to subsystems defined by the system's functional
    architecture. Our hypothesis is this: models that are specially built
    for subsystems will be more accurate than a system-wide model applied to
    each subsystem's modules. In other words, the subsystem that a module
    belongs to can be valuable information in software quality modeling.
    This paper presents an empirical case study which compared software
    quality models of an entire system to models of a major functional
    subsystem.
    The study modeled a very large telecommunications system with
    classification trees built by the Classification And Regression Trees
    algorithm (CART). For predicting subsystem quality, we found that a
    model built with training data on the subsystem alone was more accurate
    than a similar model built with training data on the entire system. We
    concluded that characteristics of the subsystem's modules were not
    similar to those of the system as a whole, and thus, information on
    subsystems can be valuable.},
  keywords     = {Software Reliability, Software Quality Models, Empirical
                  Study, Software Metrics, Fault-Prone Modules,
                  Classification Trees, CART}
}

@inproceedings{KAX00:ASSET,
  author       = {Taghi M. Khoshgoftaar and Edward B. Allen and Zhiwei Xu},
  title        = {Predicting Testability of Program Modules Using a Neural
                  Network},
  booktitle    = {Proceedings: {Symposium} on Application-Specific Systems
                  and Software Engineering Technology},
  organization = IEEECS,
  address      = {Richardson, Texas USA},
  month        = mar,
  year         = 2000,
  note         = {In press.},
  abstract     = {Voas defines testability as the probability that a test
    case will fail if the program has a fault. It is defined in the context
    of an oracle for the test, and a distribution of test cases, usually
    emulating operations. Because testability is a dynamic attribute of
    software, it is very computation-intensive to measure directly. This
    paper presents a case study of real-time avionics software to predict
    the testability of each module from static measurements of source code.
    The static software metrics take much less computation than direct
    measurement of testability. Thus, a model based on inexpensive
    measurements could be an economical way to take advantage of testability
    attributes during software development. We found that neural networks
    are a promising technique for building such predictive models, because
    they are able to model nonlinearities in relationships.
    Our goal is to predict a quantity between zero and one whose
    distribution is highly skewed toward zero. This is very difficult for
    standard statistical techniques. In other words, high-testability
    modules present a challenging prediction problem that is appropriate for
    neural networks.},
  keywords     = {Testability, Neural Network, Software Metrics,
                  Principal Components Analysis}
}

@inproceedings{KSA00:HASE,
  author       = {Taghi M. Khoshgoftaar and Ruqun Shan and Edward B. Allen},
  title        = {Using Product, Process, and Execution Metrics to Predict
                  Fault-Prone Software Modules with Classification Trees},
  booktitle    = {Proceedings: Fifth {IEEE} International Symposium on
                  High-Assurance Systems Engineering},
  organization = IEEECS,
  address      = {Albuquerque, New Mexico USA},
  month        = nov,
  year         = 2000,
  note         = {In press.},
  keywords     = {software quality, software metrics, fault-prone modules,
                  classification trees, S-Plus,
                  principal components analysis},
  abstract     = {Software-quality classification models can make
    predictions to guide improvement efforts to those modules that need it
    the most. Based on software metrics, a model can predict which modules
    will be considered fault-prone, or not. In this paper, we consider a
    module fault-prone if any faults were discovered by customers. Useful
    predictions are contingent on the availability of candidate predictors
    that are actually related to faults discovered by customers. With a
    diverse set of candidate predictors in hand, classification-tree
    modeling is a robust technique for building such software quality
    models. This paper presents an empirical case study of four releases of
    a very large telecommunications system. The case study used the
    regression-tree algorithm in the S-Plus package and then applied our
    general decision rule to classify modules.
    Results showed that in addition to product metrics, process metrics and
    execution metrics were significant predictors of faults discovered by
    customers.}
}

% KYA00:ESE -- NOTE(review): the citation key (K-Y-A) implies three authors
% but only two were listed; Xiaohong Yuan has been added as second author.
% Confirm the author list against the published article.
@article{KYA00:ESE,
  author    = {Taghi M. Khoshgoftaar and Xiaohong Yuan and Edward B. Allen},
  title     = {Balancing Misclassification Rates in Classification-Tree
               Models of Software Quality},
  journal   = ESE,
  volume    = 5,
  year      = 2000,
  note      = {In press.},
  inlibrary = LIBTR,
  abstract  = {Software product and process metrics can be useful
    predictors of which modules are likely to have faults during operations.
    Developers and managers can use such predictions by software quality
    models to focus enhancement efforts before release. However, in
    practice, software quality modeling methods in the literature may not
    produce a useful balance between the two kinds of misclassification
    rates, especially when there are few faulty modules. This paper presents
    a practical classification rule in the context of classification tree
    models that allows appropriate emphasis on each type of
    misclassification according to the needs of the project. This is
    especially important when the faulty modules are rare. An industrial
    case study using classification trees illustrates the tradeoffs. The
    trees were built using the TREEDISC algorithm which is a refinement of
    the CHAID algorithm. We examined two releases of a very large
    telecommunications system, and built models suited to two points in the
    development life cycle: the end of coding and the end of beta testing.
    Both trees had only five significant predictors, out of 28 and 42
    candidates, respectively. We interpreted the structure of the
    classification trees, and we found the models had useful accuracy.},
  keywords  = {classification trees, CHAID, TREEDISC, telecommunications,
               software quality, fault-prone modules, software metrics,
               knowledge discovery in data bases}
}

@mastersthesis{Mao00:MS,
  author   = {Wenlei Mao},
  title    = {Classification of Software Quality Using Tree Modeling with
              the {SPRINT/SLIQ} Algorithm},
  school   = {Florida Atlantic University},
  address  = {Boca Raton, Florida USA},
  month    = may,
  year     = 2000,
  note     = {Advised by Taghi M. Khoshgoftaar},
  keywords = {Software Metrics, Classification Trees, SPRINT, SLIQ,
              Software Quality}
}

% SK00:RQD -- year corrected from 1999 to 2000 so that it agrees with the
% citation key and with KAS00:RQD and XKA00:RQD, which cite the same
% proceedings (same editors, month and address).
@inproceedings{SK00:RQD,
  author       = {Robert M. Szabo and Taghi M. Khoshgoftaar},
  editor       = {Hoang Pham and Ming-Wei Lu},
  title        = {Classifying Software Modules Into Three Risk Groups},
  booktitle    = {Proceedings: Sixth {ISSAT} International Conference on
                  Reliability and Quality in Design},
  organization = {International Society of Science and Applied Technologies},
  address      = {Orlando, Florida USA},
  month        = aug,
  year         = 2000,
  note         = {Invited paper. In press.}
}

@mastersthesis{Thak00:MS,
  author   = {Vishal Thaker},
  title    = {Modeling Fault-Prone Modules of Subsystems},
  school   = {Florida Atlantic University},
  address  = {Boca Raton, Florida USA},
  month    = aug,
  year     = 2000,
  note     = {Advised by Taghi M. Khoshgoftaar},
  keywords = {Software Metrics, classification, CART}
}

@inproceedings{XKA00:RQD,
  author       = {Zhiwei Xu and Taghi M. Khoshgoftaar and Edward B. Allen},
  editor       = {Hoang Pham and Ming-Wei Lu},
  title        = {Application of Fuzzy Linear Regression Model for
                  Predicting Program Faults},
  booktitle    = {Proceedings: Sixth {ISSAT} International Conference on
                  Reliability and Quality in Design},
  organization = {International Society of Science and Applied Technologies},
  address      = {Orlando, Florida USA},
  month        = aug,
  year         = 2000,
  note         = {Invited paper. In press.}
}

@inproceedings{XKA00:HASE,
  author       = {Zhiwei Xu and Taghi M. Khoshgoftaar and Edward B. Allen},
  title        = {Prediction of Software Faults Using Fuzzy Nonlinear
                  Regression Modeling},
  booktitle    = {Proceedings: Fifth {IEEE} International Symposium on
                  High-Assurance Systems Engineering},
  organization = IEEECS,
  address      = {Albuquerque, New Mexico USA},
  month        = nov,
  year         = 2000,
  note         = {In press.},
  keywords     = {software reliability, fuzzy nonlinear regression model,
                  software metrics, neural networks,
                  multiple linear regression}
}

@inproceedings{YKAG00:ASSET,
  author       = {Xiaohong Yuan and Taghi M. Khoshgoftaar and
                  Edward B. Allen and K. Ganesan},
  title        = {An Application of Fuzzy Clustering to Software Quality
                  Prediction},
  booktitle    = {Proceedings: {Symposium} on Application-Specific Systems
                  and Software Engineering Technology},
  organization = IEEECS,
  address      = {Richardson, Texas USA},
  month        = mar,
  year         = 2000,
  note         = {In press.},
  abstract     = {The ever increasing demand for high software reliability
    requires more robust modeling techniques for software quality
    prediction. This paper presents a modeling technique that integrates
    fuzzy subtractive clustering with module-order modeling for software
    quality prediction. First fuzzy subtractive clustering is used to
    predict the number of faults, then module-order modeling is used to
    predict whether modules are fault-prone or not. Note that multiple
    linear regression is a special case of fuzzy subtractive clustering. We
    conducted a case study of a large legacy telecommunication system to
    predict whether each module will be considered fault-prone.
    The case study found that using fuzzy subtractive clustering and
    module-order modeling, one can classify modules which will likely have
    faults discovered by customers with useful accuracy prior to release.},
  keywords     = {Fuzzy Logic, Fuzzy Inference, Subtractive Clustering,
                  Software Metrics, Software Quality,
                  Telecommunications Systems}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%