% Author: Jens Nödler, noedler@web.de, http://noedler.de

% To build this document:
%  1. Run LaTeX, BibTeX, MakeIndex (makeindex <file>.nlo -s nomencl.ist -o <file>.nls)
%  2. Run LaTeX -> DVI -> PS -> PDF

% Standards for this document:
% - This \LaTeX source is best viewed with tab width of six spaces.
% - Usage of British English (except for source code and AE quotations indeed)
% - Empty XML element tags end with '/>' instead of ' />'.
% - Definitions are taken from the ISTQB, Sommerville book, or the Design Patterns book
% - URL are surrounded by \url{...} and only end with '/' when referencing a directory
% - The term 'like' is avoided and instead 'such as' or 'for example' is used. 
%   For examples, see: http://grammar.quickanddirtytips.com/like-versus-as.aspx
% - Use the regexes ' an [^aeioux]' and ' a [aeioux]' to find places of wrong usages.


% \documentclass[a4paper,11pt]{report}
\documentclass[a4paper,11pt,twoside]{report}

\usepackage[ngerman,english]{babel}

% input encoding ISO 8859-15 (Latin-9) (includes the Euro symbol: ¤)
\usepackage[latin9]{inputenc}

% "Virtual fonts for PDF-files with T1 encoded CMR-fonts" and enables umlauts!
\usepackage{ae}

% Change the name of ``References''
\AtBeginDocument{\renewcommand\refname{Bibliography}}

% Abbreviations and Acronyms
\usepackage{nomencl}
\let\abbrev\nomenclature
\renewcommand{\nomname}{Abbreviations and Acronyms}
\setlength{\nomlabelwidth}{.25\hsize}
\renewcommand{\nomlabel}[1]{#1 \dotfill}
\makenomenclature

% Teaching LaTeX some new hyphenations
\hyphenation{stan-dard stan-dards frame-work frame-works name-space name-spaces}

% Disable single lines at the start of a paragraph (Schusterjungen)
\clubpenalty = 10000
% Disable single lines at the end of a paragraph (Hurenkinder)
\widowpenalty = 10000
\displaywidowpenalty = 10000

\usepackage[
	bookmarks=true,		% Create bookmarks (as the TOC)
	bookmarksnumbered=true, % Number the bookmarks (as the TOC)
	bookmarksopen=true,	% Show the bookmarks when opening the PDF
	pdfstartview=FitH,	% Fit page width when opening the PDF
	pdfborder={0 0 0},	% no borders around the links
]{hyperref}

% Use URLs and set the font size for URLs
\usepackage{url}
\def\UrlFont{\scriptsize\ttfamily}

% Allow line breaks in URLs when using the LaTeX > DVI > PS > PDF workflow
% MUST be loaded after the hyperref package!
\usepackage[hyphenbreaks]{breakurl}

% \ref{} jumps to the definition of the label but instead it should
% jump to the heading itself. Requires: \usepackage{hyperref}
\usepackage[all]{hypcap}

% Preserves this error: ``destination with the same identifier 
% (name{page.1}) has been already used, duplicate ignored''
\hypersetup{pageanchor=false}

% Some user-defined commands
\newcommand{\ttcn}{\mbox{TTCN-3}}
\newcommand{\code}[1]{\mbox{\texttt{\scriptsize#1}}}
\newcommand{\brcode}[1]{\texttt{\scriptsize\hyphenchar\font=`.#1}} % allow linebreaks at dots
\newcommand{\brrcode}[1]{\texttt{\scriptsize\hyphenchar\font=`-#1}} % allow linebreaks at hyphens
\newcommand{\headcolumn}[1]{\sffamily{\textit{#1}}}
\newcommand{\paraskip}{\vspace{2mm}}

% Enables bold ttfamily fonts
\usepackage{luximono}
% More about bold ttfamily:
% http://www.tex.ac.uk/cgi-bin/texfaq2html?label=bold-extras
% http://ubuntuforums.org/showthread.php?t=170324

% Colured panels behind specified columns in a table
\usepackage{colortbl}

% The pstricks stuff
\usepackage{pstricks}
\usepackage{pst-node}
% \usepackage{pst-tree}
\newcommand{\psbox}[2]{\psframebox[framearc=.15]{\parbox{#1}{\centering #2}}}

% Redefine the color "lightgray"
\usepackage{color}
\definecolor{lightgray}{gray}{0.92}

% Use a sans-serif font
% \usepackage{cmbright}

% Adjusts line spacing
% \usepackage{setspace}
% \setstretch{1.04}

% -- <listings> --
\usepackage{listings}
% First define common settings for all languages
\lstset{
	tabsize=2,
	showstringspaces=false,
	basicstyle=\scriptsize\ttfamily,
	% Display strings, comments and identifier like basicstyle
	stringstyle=,
	commentstyle=,
	identifierstyle=,
	keywordstyle=\ttfamily\textbf,
	numbers=left,
	numberstyle=\scriptsize\ttfamily,
	backgroundcolor=\color{lightgray},
	% Draw a small line around listings
	frame=single,
	% Extra space before and below listings
%  	aboveskip=\smallskipamount\smallskip,
% 	belowskip=\medskipamount\medskip,
	breaklines=true,
	escapeinside={/*@}{@*/},
	% captions below listings
	captionpos=b,
	% if the float is defined here it is IGNORED!
	% it needs to be defined inside \lstdefinestyle{}!
% 	float=h,
}
% TTCN-3 v2.2.1 (2002-08)
\lstdefinelanguage{TTCN-3}{
	morekeywords={action,activate,address,all,alt,altstep,and,and4b,any,anytype,bitstring,boolean,call,catch,char,
		charstring,check,clear,complement,component,connect,const,control,create,deactivate,default,disconnect,
		display,do,done,else,encode,enumerated,error,except,exception,execute,extends,extension,external,fail,
		false,float,for,from,function,getverdict,getcall,getreply,goto,group,hexstring,if,ifpresent,import,in,
		inconc,infinity,inout,integer,interleave,label,language,length,log,map,match,message,mixed,mod,modifies,
		module,modulepar,mtc,noblock,none,not,not4b,nowait,null,objid,octetstring,of,omit,on,optional,or,or4b,
		out,override,param,Modulepar,pass,pattern,port,procedure,raise,read,receive,record,recursive,rem,repeat,
		reply,return,running,runs,self,send,sender,set,setverdict,signature,start,stop,subset,superset,system,
		template,testcase,timeout,timer,to,trigger,true,type,union, universal,unmap,value,valueof,var,variant,
		verdicttype,while,with, xor,xor4b},
	sensitive,
	morecomment=[s]{/*}{*/},
	morecomment=[l]//,
	morestring=[b]",
	morestring=[b]',
}
% XQuery v1.0 (2007-01), http://www.w3.org/TR/xquery/#id-terminal-delimitation
\lstdefinelanguage{XQuery}{
	morekeywords={ancestor,ancestor-or-self,and,as,ascending,at,attribute,base-uri,boundary-space,by,case,cast,castable,
		child,collation,comment,construction,copy-namespaces,declare,descendant,descendant-or-self,descending,
		div,document,document-node,element,else,empty,empty-sequence,encoding,eq,every,except,external,following,
		following-sibling,for,function,ge,greatest,gt,idiv,if,import,in,inherit,instance,intersect,item,lax,le,
		least,let,lt,mod,module,namespace,ne,node,no-inherit,no-preserve,option,or,order,ordered,ordering,
		preceding,preceding-sibling,preserve,processing-instruction,return,satisfies,schema,schema-attribute,schema-element,
		self,some,stable,strict,strip,text,then,treat,typeswitch,union,unordered,validate,variable,version,where,xquery},
	% removed from the list of keywords: parent,is,of,to,default
	otherkeywords={-,;,::,:=,!=,?,?>,/,//,/>,.,..,(\#,[,],]]>,\{,\},@,\$,*,\#),+,<,<!--,<![CDATA[,<?,</,<<,<=,=,>,-->,>=,>>,|},
	sensitive,
	morecomment=[s]{(:}{:)},
	morestring=[b]",
	morestring=[b]',
}
% It seams to be needless to define own styles if the properties of \lstset{}
% are not overwritten here. But it leaves the flexibility to do so later. :D 
\lstdefinestyle{TTCN3}{
	language=TTCN-3,
	float=h,
}
\lstdefinestyle{Java}{
	language=Java,
	float=h,
}
\lstdefinestyle{XQuery}{
	language=XQuery,
	float=h,
}
\lstdefinestyle{XQueryNoFloat}{
	language=XQuery,
}
\lstdefinestyle{XML}{
	language=XML,
	float=h,
}
\lstdefinestyle{none}{
	float=h,
}
% -- </listings> --

% This MUST be included after the listing package
\usepackage{pst-uml}

% Used for a better control of the page numbering
\usepackage{fancyhdr}
\fancypagestyle{plain}{
	\fancyhf{} % clear all header and footer fields (hence, no page numbers)
	\renewcommand{\headrulewidth}{0pt}
	\renewcommand{\footrulewidth}{0pt}
}

% Allows resizing text area of the pages
\usepackage{typearea}


\title{
	\sffamily\Large Master's Thesis \\
	\vspace{8mm}
	\sffamily\LARGE\textbf{\mbox{An XML-based Approach for Software Analysis}} \\
	\vspace{1mm}
	\sffamily\Large\textbf{\mbox{ }\mbox{ }Applied to Detect Bad Smells in TTCN-3 Test Suites}
}
\author{\sffamily\Large Jens M. Nödler}
\date{
	\sffamily\Large 2007-10-26 \\ % ISO 8601 standard (yyyy-mm-dd)
	\vspace{88mm}
	Supervised by Dr.~Helmut Neukirchen \\
	Co-supervised by Prof.~Dr.~Jens Grabowski \\
	Software Engineering for Distributed Systems Group \\
	Institute for Computer Science \\
	University of Göttingen, Germany \\
}


\begin{document}



% Set page to a size like defined by the report stlye
\areaset[22mm] % additional margin left
	{12.7cm} % width
	{21.2cm} % heigth



% Header empty, page number in footer:
\pagestyle{plain}
% Use headings for each page including current page number and section title
% \pagestyle{headings}


\maketitle


% dedication
{
~\\
\vspace{30mm}
~\\
\raggedleft
\noindent To everybody who contributed to this thesis,\\
supported my work, and shared time with me.\\
\vspace{5mm}~\\
I appreciate it.\\
}
\pagebreak

% ``Given unlimited resources, the majority of software problems can probably be solved but the challenge for software engineers is to produce high-quality software with a finite amount of resources and to a predicted schedule.'' -- Ian Sommerville



\begin{abstract}

\noindent
This thesis presents an XML-based approach for software analysis. Patterns in software artefacts which should be found by the analysis are described using the declarative XML query language XQuery. Implementation and design of a software analysis framework are presented. The design of the framework allows patterns to be described in a generic, abstract, and reusable way. The framework is customised for the detection of places in source codes which should be refactored, so-called bad smells. As a case study, the framework is used to detect bad smells in test suites written in the testing language TTCN-3.

\vspace{2cm}

\begin{otherlanguage}{ngerman}
\begin{center} 
\textbf{Zusammenfassung} \\
\end{center}
\noindent
Diese Arbeit präsentiert einen XML-basierten Ansatz zur Software-Analyse. Software-Muster, die von der Analyse gefunden werden sollen, sind mittels der deklarativen XML-Abfragesprache XQuery beschrieben. Das Design des implementierten Software-Analyse-Frameworks ermöglichen eine generische, abstrakte und wiederverwendbare Musterbeschreibung. Das Framework ist angepasst für die Erkennung von Stellen in Software-Quelltexten, die mittels Refactoring überarbeitet werden sollten. Diese Stellen werden als "`Bad Smells"' bezeichnet. Als Fallbeispiel wird das Framework benutzt, um solche "`Bad Smells"' in Testfällen zu finden, die in der Test-Sprache TTCN-3 beschrieben sind.
\end{otherlanguage}

\end{abstract}

% Keywords: Bad smell, code smell, smells, testing, test, testcase, test suite, TTCN, TTCN-3, software analysis, program analysis, software analysis framework, static analysis, XML, XQuery, XPath, XAF, smell detection, pattern detection, pattern language, query language, source code, Eclipse, TRex, TPTP, Saxon


% Just done to fit the TOC nicely on two pages...
\areaset[22mm] % additional margin left
	{12.7cm} % width
	{17cm} % heigth

\setcounter{secnumdepth}{3} % Allows to have a section to be numbered 2.4.4.1
\setcounter{tocdepth}{1}    % ... but the TOC only shows the depth up to 2.4
\tableofcontents

% Reset page to size like defined by the report stlye
\areaset[22mm] % additional margin left
	{12.7cm} % width
	{21.2cm} % heigth


\chapter{Introduction} \label{ch_intro}

\fancypagestyle{plain}{
	\fancyhf{} % clear all header and footer fields
	\fancyfoot[LO,RE]{\thepage} % page numbers (left on odd pages, rigth on even pages)
	\renewcommand{\headrulewidth}{0pt}
	\renewcommand{\footrulewidth}{0pt}
}
% enable page numbers
\pagestyle{plain}
% start with page number 1
\pagenumbering{arabic}

Software is continuously changing as the customer's requirements are growing and varying. This makes software more complex and can lead to so-called \emph{software ageing}. The cost of maintaining and enhancing existing software is a deciding factor in a software's lifetime~\cite{mendoncca2004bfr}. About 50\% of the software developer's time is consumed by browsing and comprehending existing source code~\cite{vonmayrhauser1995pcd}. Maintenance activities took over 60\% of all software development costs in the last three decades~\cite{pressman2005sep}. These numbers reveal the importance and usefulness of analysis technologies as the starting point for the enhancement of software quality, reusability, and maintainability~\cite{kullbach1999qet}.

Software analysis can be seen as the counterpart of software engineering. While the latter describes processes for the development of new software, the analysis is an elementary part in examining and understanding existing software. The analysis of software concerns internal properties like source code and external properties like runtime behaviour. Software can be analysed regarding different qualities like functionality, reliability, and maintainability for instance~\cite{ISO9126}. The analysis of internal structures of software is not restricted to reverse engineering or reengineering and can be used amongst others for the following tasks: conformance checking regarding coding styles, extraction of software facts, calculation of source code metrics, control and data flow analysis, and detection of source code anomalies and security-related bugs~\cite[Chap.~22]{sommerville}.
%  The analysis of source code's internal structure is the main focus of this thesis.

Refactoring enhances the internal structure of source code without changing the observable behaviour. The purpose is to increase the software's maintainability and comprehensibility. Refactoring started as a manual process separated from the actual software development. Nowadays refactoring is widely adopted, automated, and tightly integrated with the software development~\cite[Chap.~14]{refactoring}. Still it is up to the developer's experience and intuition to know \emph{where} to apply refactorings. Locations in the source code which should be refactored are called \emph{bad smells}. Hence, deploying software analysis for the automated detection of bad smells is the next consequent step towards higher software quality. 

Bad smells can be understood as patterns in source code. Describing patterns using a declarative language provides good readability, reusability, and extensibility. Hence, the main goal of this thesis is to develop a declarative way for the description of patterns like bad smells. The XML query language XQuery provides a declarative syntax and is used for the description of software patterns. Therefore, software that is to be analysed, needs to be represented using XML.

The usage of XML and XQuery is bundled in a software analysis framework developed as a part of this thesis. The framework provides the possibility to describe patterns in a generic and reusable way. This is achieved by abstracting from the underlying source code and using XQuery for the pattern description and detection. The framework's design allows arbitrary kinds of analyses: pattern-based detection of anomalies such as bad smells and interactive source code queries for instance.

An application of the framework is the detection of bad smells in \ttcn{} source code. The testing language \ttcn{} is a standardised language for the implementation of test suites for black-box testing. The syntax of the \ttcn{} core notation is very close to those of other common programming languages. Hence, bad smells can also occur in \ttcn{} test suites. The framework is integrated in the \ttcn{} tool TRex which already has refactoring capabilities. The automated smell detection helps bridging the gap towards a smoother refactoring process.

% The rising demand to analysis and query source code is demonstrated by the success of online search engines for source code in the last years. Sites like koders.com, krugle.com, and Google Code Search index publicly available source code and allow to query it. Even though the query power is limited to regular expressions at most, these sites are not only used to find example source code snippets but also for the detection of bugs.\footnote{\burl{http://asert.arbornetworks.com/2006/10/static-code-analysis-using-google-code-search/}}


\section{Structure of this Thesis}

This thesis is structured in seven chapters. Following to this introduction, chapter~\ref{ch_foundations} provides foundations which reoccur throughout this thesis. This includes patterns in software artefacts like bad smells, an introduction to the testing language \ttcn{}, a preface to XML and related technologies, and an excursion concerning the representation of software artefacts. Related work regarding strategies for the detection of software patterns like bad smells is discussed in chapter~\ref{ch_relatedwork}. As a part of this thesis, a software analysis framework is developed whose requirements and design are presented in chapter~\ref{ch_req_and_design}. In the subsequent chapter~\ref{ch_impl}, the implementation of the XML-based analysis framework and its adaption for the detection of bad smells in \ttcn{} test suites is discussed in depth. The penultimate chapter~\ref{ch_5} shows applications of the framework and discusses the extension and customisation of the framework to new fields of application. Finally, chapter~\ref{ch_summary} provides a summary of this thesis and an outlook towards future work.


\chapter{Foundations} \label{ch_foundations}

This chapter introduces foundations which occur throughout this thesis. Section~\ref{patterns} gives an overview of different terms related to patterns in software artefacts. The main topic of this thesis is software analysis which is applied to the testing language \ttcn{}. The language is introduced in section~\ref{ttcn3} by describing its features and syntax. Section~\ref{xml} gives an introduction to the \emph{Extensible Markup Language} (XML) and the XML-related technologies XPath and XQuery. These XML technologies are used as the backbone of the software analysis. As the representation of software is elementary for the analysis of software, different representation formats for software artefacts are introduced in section~\ref{sw_representations}. Readers who are already familiar with the presented foundations may skip single sections or the complete chapter~\ref{ch_foundations}.


\section{Patterns in Software Artefacts} \label{patterns}

\abbrev{ISTQB}{International Software Testing Qualification Board}

This section introduces terms describing patterns in software artefacts. As these terms do not only apply to patterns which are restricted to the source code level but also apply to the architectural level of software, the all-embracing term \emph{software artefacts} is used for both levels. No strict definition is available for most of the terms to be introduced. Therefore, they are explained on a descriptive level and differentiations between them are made. By contrast, a few terms are defined by the \emph{International Software Testing Qualification Board} (ISTQB) in the \emph{Standard glossary of terms used in Software Testing}~\cite{istqb} and are used in this thesis wherever applicable.

\emph{Patterns} were first used by the architect Christopher Alexander~\cite{alexander1977plt} in the late~70s as a formal way of documenting successful solutions to problems regarding urban planning and building architecture. They provide high-level architectural descriptions in a generic and reuseable way instead of addressing individual cases on a low level. Each pattern consists of four essential elements: a unique name to establish a common vocabulary, a context in which the pattern might be applied, a description of the problem that the pattern is capable to solve, and the description how to solve this problem. These four elements apply to all kinds of patterns, not only to those for buildings, as described by Alexander.

In the mid~90s the term \emph{design patterns} was coined by a group of four researchers called ``The Gang of Four''~\cite{GammaEtAl} in the field of software architecture. Instead of talking about patterns in buildings and towns as Alexander did, they used the term design patterns to describe reuseable elements of object-oriented software. As the term \emph{design} in design pattern states, they provide solutions on an architectural level. The design patterns were identified by their common usage to solve recurring problems. A design pattern catalogue, introduced in~\cite{GammaEtAl}, uses a detailed format to describe each design pattern including motivation, applicability, graphical representation, implementation, examples, and related patterns. The patterns of the catalogue are grouped by their main purpose: creational patterns (like \emph{Singleton} and \emph{Factory Method}), structural patterns (like \emph{Adapter} and \emph{Facade}), and behavioural patterns (like \emph{Iterator} and \emph{Observer}).

The idea of patterns was widely adopted and is nowadays used in many more topics than the design of object-oriented software. The term \emph{bug patterns} for example is at least used twice: to describe recurring relationships between signalled errors and underlying bugs in programs regarding debugging~\cite{bugpatterns} and also as ``error-prone coding practices that arise from the use of erroneous design patterns, misunderstanding of language semantics, or simple and common mistakes''~\cite{FindBugs04}. \emph{Test patterns}~\cite{xunit} provide common solutions for problems in nearly all phases of the test process. For example, the patterns \emph{Mock Object} and \emph{Fresh Fixture}. Patterns are however not limited to the technical level and can also be used at the organisational and management level~\cite{coplien2004opa} for instance.

\emph{Anti-patterns}~\cite{antipatterns} are the counterpart of patterns and describe commonly occurring solutions to problems leading to negative consequences. The idea of anti-patterns (which are sometimes also called \emph{pitfalls}) is to show how \emph{not} to solve a problem. Like patterns, anti-patterns are not limited to the design of software: design anti-patterns are just one category of anti-patterns. Others are project management, programming, methodological, and organisational anti-patterns. The programming anti-pattern \emph{Spaghetti Code} for example describes a program with a software structure that lacks clarity and is therefore hard to maintain and extend because of its complexity~\cite{designdefects}.

\emph{Design defects} are related to design patterns. While design patterns propose proven solutions to recurring design problems in object-oriented architectures, design defects describe occurring errors in the design of software~\cite{designdefects2, designdefects}. These errors might originate from the absence or the wrong usage of design patterns, which is ``the most common mistake in using design patterns''~\cite[page~8]{antipatterns}.

Once a design defect, design anti-pattern or programming anti-pattern is identified, a method called \emph{refactoring} can be applied to transform the software towards the usage of design patterns~\cite{kerievsky}. Refactoring is defined by Fowler as ``a change made to the internal structure of software to make it easier to understand and cheaper to modify without changing its observable behavior''~\cite{refactoring}. Analogous to the mentioned catalogue of design patterns, Fowler created a refactoring catalogue consisting of seven categories where each refactoring is presented using a unique name, summary, motivation, mechanics, and examples. Common refactorings are \emph{Rename}, \emph{Encapsulate Field}, and \emph{Extract Method}.

\paraskip

\emph{Bad smells} are ``certain structures in the code that suggest (sometimes they scream for) the possibility of refactoring''~\cite[Chap.~3]{refactoring}. According to this definition, bad smells are located at the source code level and are therefore often called \emph{code smells}. The term \emph{refactoring opportunities} is also used as a synonym for bad smells. Since the term bad smells was introduced, different other kinds of smells were identified: project smells~\cite{andrea2002cxp} (that relate to project management anti-patterns), architectural and design smells (that are located at the design level and relate to design defects), and test smells identified by Meszaros~\cite{xunit}. According to Meszaros, three kinds of test smells can be found: smells related to the source code level of tests, behavioural smells that affect the outcome of a test execution, and project smells as indicators of the overall health of a test project which do not involve looking at test code or execution of tests.

The original list of bad smells was presented by Fowler and Beck~\cite[Chap.~3]{refactoring} additionally to the refactoring catalogue in the same book. Each smell consists of a unique name, a textual description and proposed refactorings to remove the smell. The list of smells is less well structured compared to the catalogues of design patterns and refactorings where each pattern/refactoring follows a strict format. Another drawback of the list of bad smells is the missing topical grouping into categories like the design patterns/refactorings catalogues. The reasons to remove bad smells (as well as design/programming anti-patterns and design defects) by refactoring are various. The most import reasons are: improve the design of software, make software easier to understand, find bugs, and to programme more efficiently~\cite[Chap.~2]{refactoring}. Altogether, this helps to reduce the maintenance costs and to deliver faster and more reliable software.

Exemplary, the common smells \emph{Duplicated Code} and \emph{Long Parameter List} are presented in a summarised way as they appeared in the original list of bad smells~\cite[Chap.~3]{refactoring}:
\begin{itemize}
\item \emph{Duplicated Code} is the self-describing name for the same code structure found in more than one places. Fowler's advice is to unify the places where the duplicates occur, depending on the their relation to each other. ``The simplest duplicated code problem is when you have the same expression in two methods of the same class''~\cite[page~63]{refactoring}. The recommended refactoring in this case is \emph{Extract Method} whereas \emph{Extract Class} is recommended if the duplicated code occurs in different classes.
\item The smell \emph{Long Parameter List} applies to methods that have many (for example more than six) parameters. In object-oriented systems long parameter lists should be avoided and instead the method's class should provide most of the required data. Fowler suggests to apply the refactoring \emph{Introduce Parameter Object} to use one object as parameter that contains all required data or \emph{Replace Parameter with Method} to let the called method collect the required data itself by calling all necessary methods.
\end{itemize}

\paraskip

\emph{Software metrics} are strongly related to bad smells as they can be used to assess the quality of code according to the ISO/IEC~9126 standard~\cite{ISO9126}. The ISTQB defines metrics as an overloaded term: ``a measurement scale and the method used for measurement''~\cite{istqb}. In addition to this definition, the measurement scale must be refined to suit the requirements of software metrics. Hence, the scale must be at least an interval, better a ratio or absolute scale to be able to compare and rank the values of the scale~\cite{pandian2003smg}. Applying this definition to the metric \emph{Lines of Code} means that the method used for the measurement is counting all non-empty lines of the source code. The scale of the measurement is the result of the method used for the measurement. In this case the absolute scale would be the number of lines of code, allowing it to compare the results of this metric applied to different entities.

Another definition of metrics originates from Fenton and Pfleeger~\cite{fenton1997smr} whereby software metrics embraces all activities which involve software measurement. This definition is much wider compared to the one of the ISTQB and includes measures for properties and attributes of entities like processes, resources, and products. Furthermore, it can be distinguished between external and internal attributes, whereat external attributes can be measured only with respect to other entities in their environment. Internal attributes can be measured by examining the products, processes, or resources themselves, and can be separated from their environments. An internal attribute is the concept of internal quality of the ISO/IEC~9126 standard~\cite{ISO9126}. An example for an internal attribute is the size of the source code of a product as it can be measured in isolation. In contrast, productivity is an external attribute of an entity. For the measurement of the external attributes of a product, execution of the product is required, whereas for measuring internal attributes, static code analysis is sufficient in most cases~\cite{Zei06+}.

Regardless of their definitions, metrics can be grouped into two categories: \emph{size metrics} (like \emph{Program Volume}~\cite{halstead}) are based on the number of occurrences of certain language constructs and \emph{structural metrics} (like McCabe's \emph{Cyclomatic Number}~\cite{McCabe76}) are calculated with respect to the structural layout of a program, for instance its branching characteristics.

\paraskip

Bad smells and metrics are related to the source code level while design anti-patterns and especially design defects cover defects on an architectural level. Design patterns, refactorings, and bad smells refer to the object-oriented programming in general and in particular to the programming language \emph{Java}. This means using terms such as \emph{class} and \emph{method} in the context of Java and describing patterns that apply to features specific to Java. However, most of the underlying ideas can be applied for other programming languages as well---even procedural and functional ones.


\section{TTCN-3 Testing Language} \label{ttcn3}

This section introduces the testing language \ttcn{} by describing its features and syntax---exemplified by a test case written in \ttcn. Afterwards, in section~\ref{ttcn3smell} the \emph{\ttcn{} Code Smell Catalogue} is presented which is an adaption of the list of bad smells for \ttcn.

\subsection{The TTCN-3 Core Language} \label{ttcn3core}

\abbrev{TTCN-3}{Testing and Test Control Notation Version~3}
\abbrev{SUT}{System Under Test}
\abbrev{MTC}{Main Test Component}
\abbrev{ETSI}{European Telecommunications Standards Institute}
\abbrev{ITU-T}{ITU Telecommunication Standardization Sector}
\abbrev{ITU}{International Telecommunication Union}
\abbrev{HTTP}{Hypertext Transfer Protocol}

\ttcn{} is a standardised language for the specification and implementation of test suites for black-box testing. The acronym stands for \emph{Testing and Test Control Notation Version~3}. \ttcn{} is widely used for the testing of the implementations of telecommunication and Internet protocols. \ttcn{} test suites reflect the requirements a \emph{System Under Test} (SUT) has to fulfil. Hence, the test execution is the verification of the SUT~\cite{sommerville}. The \ttcn{} standard~\cite{t3standard1} is maintained by the \emph{European Telecommunications Standards Institute} (ETSI) and the first edition of the standard was published in May 2001~\cite{t3standard1-1stedition}. It is also approved by the ITU Telecommunication Standardization Sector (ITU-T)~\cite{t3standard1-ITU} as part of the \emph{Z-series}, which is a collection of standards about languages and general software aspects for telecommunication systems.

``\ttcn{}'s main feature is the separation of concern between Abstract Test Suites and the Adapter Layer, which allows full portability of test suites and thus make them independent of any platform implementation.''\footnote{\url{http://www.site.uottawa.ca/~bernard/ttcn3_in_a_nutshell.html}} For instance, a~\ttcn{} test suite for a specific protocol can be used to test different implementations without the need to change the test suite itself---only the adaptor layer might need an update. Hence, test suites written in \ttcn{} provide a high level of reusability. This adaptor layer is called \emph{Real Test System Interface} and is located between the \ttcn{} test system and the SUT as illustrated in figure~\ref{ttcn3_overview}. Other major capabilities of \ttcn{} are the support of different presentation formats, dynamic concurrent test configurations, operations for synchronous and asynchronous communications, and data and signature templates with powerful matching mechanisms~\cite{t3evolution}.

The latest edition of the \ttcn{} standard is ``Edition~3 Version~2'' (v3.2.1) published in February~2007, which consists of 10~documents whereat the first one specifies the core notation~\cite{t3standard1} of the language---a textual syntax similar to other procedural programming languages but aware of test-specific extensions~\cite[Chap.~1]{ttcn3wiley}. The remaining documents describe two alternative presentation formats of \ttcn{} (Tabular Presentation Format~\cite{t3standard2} and Graphical Presentation Format~\cite{t3standard3}), the operational semantics of \ttcn{}, components to connect the test suite to the SUT, components to control and log the test execution, several language bindings, and data exchange mappings. Altogether, \ttcn{} is more than just a language for the specification of test cases---it is a framework supporting a broad range of the testing process. Every time \ttcn{} is mentioned in this thesis, the core language of \ttcn{}~\cite{t3standard1} is meant---not \ttcn{} as a testing framework---because the software analysis is applied to the core language.

Therefore a closer look at the \ttcn{} core language is required. The building blocks of \ttcn{} are modules, which contain all \ttcn{} code and consist of a definition and a control part whereat both are optional. Modules may import definitions from other modules and can be parametrised to provide more flexibility. Declarations such as data structures and functions are made within the definition part---the control part in turn specifies in which order, under which preconditions, and with which parameters test cases are executed~\cite[Chap.~3]{ttcn3wiley}. The term \emph{test suite} is synonymous with a complete \ttcn{} module containing test cases and a control part. The body of a simple \ttcn{} module as shown in listing~\ref{ttcn3_module} consists of a declaration (line~2) and an empty control part only containing a comment (line~3).

\begin{lstlisting}[style=TTCN3, caption=The body of a \ttcn{} module, label=ttcn3_module]
module MyModule {
	const integer c_integer := 23;
 	control {  /* [...] */  }
}
\end{lstlisting}

\ttcn{} is a typed language with a large number of built-in types namely integer, float, boolean, different string types and a verdict type which reflects the result of a test case. Allowed verdict values are \code{pass}, \code{fail}, \code{inconc}, \code{none}, and \code{error}. Common programming constructs such as conditions and loops are also supported as specialised language constructs related to testing such as timers, default behaviour, and alternative behaviour based on the reactions of the SUT~\cite{t3standard1}.

The communication inside the \ttcn{} test system and the communication with the SUT is abstracted by user-defined ports. They facilitate communication between test components and the SUT and also among the test components itself. Figure~\ref{ttcn3_overview} illustrates that a test suite consists of at least one \emph{Main Test Component} (MTC) and optionally additional \emph{Parallel Test Components} (PTC). Ports are used for the communication between MTC and PTCs and for the communication between MTC/PTCs and the SUT. The operations on ports provide message-based and procedure-based communication capabilities.

\input{pstricks/ch2_ttcn3_ports}

For instance, the system under test is an \emph{Hypertext Transfer Protocol} (HTTP) server. An according module must be defined which comprises the port definitions for sending and receiving requests and responses as shown in listing~\ref{ttcn3_http_ports} in lines~2 and~3. These ports must be assigned to a component that reflects the interface of an HTTP \emph{client} (lines~5--8) as the test suite plays the role of the client. Indeed, these abstract ports need to be mapped to the real interfaces of the SUT what invokes the adaption layer of \ttcn{}. This layer is not described in detail here, as it is not relevant for the understanding of the \ttcn{} core language.

\begin{lstlisting}[style=TTCN3, caption={HTTP module, part 1: definition of ports and a component}, label=ttcn3_http_ports]
module HTTP {
	type port PortTypeOutput message { out charstring }
	type port PortTypeInput  message { in  charstring }

	type component HTTPClientComponentType {
		port PortTypeInput  InputPort;
		port PortTypeOutput OutputPort;
	}
\end{lstlisting}

\emph{Templates} are a special kind of data structure providing parametrisation and matching mechanisms for specifying test data to be sent or received over test ports. Templates are used to either transmit a set of distinct values or to test whether a set of received values matches the template specification~\cite[Chap.~10]{ttcn3wiley}. 

Dynamic test behaviour is expressed using test cases. \ttcn{} statements include powerful behavioural description mechanisms: functions and testcases are used to specify and structure test behaviour, to define default behaviour and to structure modules. The alt statement denotes branching of test behaviour due to the reception and handling of communication and/or timer events and/or the termination of parallel test components. Each alt statement consists of at least one branch including a condition and an action to be executed. Branches may be extracted into own function-like statements (called \emph{altstep}) for their reuse or the activation of an altstep as a default behaviour in an alt statement~\cite{t3standard1}.

In the next step the \ttcn{} module \code{HTTP} from listing~\ref{ttcn3_http_ports} is extended to test an HTTP request/response cycle. Therefore a template that matches a response is defined in line~9 of listing~\ref{ttcn3_http_testcase} using a regular expression pattern. A test case should test the SUT which is abstracted using components. Therefore the test case \code{HTTPHeadRequest} is defined to run on the HTTP client component (line~11) as defined in listing~\ref{ttcn3_http_ports}. That allows to access the ports of this component and to send an HTTP request through the output port to the server (line~14). The following alt statement consists of three steps that cover different possible responses of the server. In the first case (lines~17--19) an HTTP response that matches the template \code{HTTPResponse} is received via the input port and the test verdict is set to \code{pass}. The other steps cover potential error conditions. The second alternative step (lines~20--22) sets the verdict to \code{fail} if a response is received that does \emph{not} match the template \code{HTTPResponse}. The third altstep is triggered if no response at all is received within 5~seconds (lines~23--25) based on the timer that was started (in line~13) right before the request was send.

\begin{lstlisting}[style=TTCN3, firstnumber=9, caption={HTTP module, part 2: testing a request/response cycle}, label=ttcn3_http_testcase]
	template charstring HTTPResponse := pattern "HTTP/1.1 \d\d\d *";

	testcase HTTPHeadRequest() runs on HTTPClientComponentType {
		timer t;
		t.start(5.0);
		OutputPort.send("HEAD /index.html HTTP/1.1");

		alt {
			[] InputPort.receive(HTTPResponse) {
				setverdict(pass);
			}
			[] InputPort.receive {
				setverdict(fail);
			}
			[] t.timeout {
				setverdict(fail);
			}
		}
	}
}
\end{lstlisting}


\subsection{The TTCN-3 Code Smell Catalogue} \label{ttcn3smell}

The \emph{\ttcn{} Code Smell Catalogue} is the first approach to adopt the bad smells of Fowler~\cite{refactoring} for the core language of \ttcn. It was first presented in the master's thesis of Bisanz~\cite{Bisanz06} and is now maintained by the Software Engineering for Distributed Systems Group at the University of Göttingen~\cite{Neukirchen07, Neukirchen08}. 

The catalogue for \ttcn{} differs in many points from the original list of bad smells:
\begin{itemize}
\item The catalogue contains not only bad smells that can be found in (more or less) any programming language but also additionally smells that only apply to \ttcn.
\item Even though removing bad smells by refactoring the source code means by definition to preserve the behaviour of the program, the catalogue contains some smells that cannot be removed without changing the observable behaviour.
\item The bad smells of the catalogue are grouped in ten different categories to add structure and clarity. The following categories contain general as well as \ttcn{}-specific smells: \emph{Duplicated Code}, \emph{References}, \emph{Parameters}, \emph{Complexity}, \emph{Coding Standards}, \emph{Data Flow Anomalies}, and \emph{Miscellaneous}. Additionally, these categories only contain smells related to \ttcn: \emph{Default Anomalies}, \emph{Test Behaviour}, and \emph{Test Configuration}.
\item Each bad smell follows a strict format describing the smell in depth: \emph{Name}, \emph{Derived from}, \emph{Description}, \emph{Motivation}, \emph{Options} (optional), \emph{Related action(s)}, and \emph{Example}.
\item Following the categorisation of test smells by Meszaros~\cite{xunit} the code smell catalogue contains source code smells and behaviour smells.
\end{itemize}

As one goal of this thesis is to detect the bad smells of this catalogue in \ttcn{} test suites, two smell examples are quoted~\cite{Bisanz06}:

\begin{description}
\item[Name:] Singular Template Reference
\item[Derived from:]~\cite{Zei06} (Motivation for \emph{Inline Template})
\item[Description:] A template definition is referenced only once.
\item[Motivation:] If a template definition is referenced only once, it can be inlined without duplicating code. This can improve readability if the template definition is not too complex; in case of a very complex template a separate template definition can still be preferable.
\item[Related action(s):] \emph{Inline Template}
\item[Example:] In listing~\ref{listings-SingularTemplateReference}, assume the only reference to the template named \code{exampleTemplate} (lines \mbox{\ref{code:SingularTemplateReference:1a}--\ref{code:SingularTemplateReference:1b}}) is the one in test case \code{exampleTestCase} (line~\ref{code:SingularTemplateReference:2}). In this case, the template could be inlined to reduce code length and improve readability.
\end{description}
\lstinputlisting[style=TTCN3, caption={The bad smell \emph{Singular Template Reference}}, label=listings-SingularTemplateReference]{listings/SingularTemplateReference.ttcn3}

\begin{description}
\item[Name:] Long Parameter List
\item[Derived from:]~\cite{refactoring} (\emph{Long Parameter List})
\item[Description:] High number of formal parameters.
\item[Motivation:] Long parameter lists are hard to read and should be avoided. Although this smell is more relevant for object-oriented languages (because method parameters can be replaced by attributes within a class), a group of single parameters can be replaced by a record type parameter. If the calling behavioural entity (function, test case or altstep) gets a parameter by calling another function or altstep and does not need the parameter by itself, \emph{Replace Parameter with Function} can be applied.
\item[Related action(s):] \emph{Replace Parameter with Function} or \emph{Introduce Record Type Parameter}
\item[Example:] Listing~\ref{listings-LongParameterList} contains the signature of a function with six integer parameters (line~\ref{code:LongParameterList:1}). Instead a set or record type could be used.
\end{description}
\lstinputlisting[style=TTCN3, caption={The bad smell \emph{Long Parameter List}}, label=listings-LongParameterList]{listings/LongParameterList.ttcn3}



\section{XML Technologies} \label{xml}

This section introduces the \emph{Extensible Markup Language} (XML) and the XML query languages XPath and XQuery. All technologies are used for the software analysis framework presented in chapter~\ref{ch_impl}: XML as representation format for \ttcn{} source code and XPath/XQuery to analyse the XML-encoded \ttcn{} source code for specific characteristics and patterns.


\subsection{Extensible Markup Language}

\abbrev{XML}{Extensible Markup Language}
\abbrev{W3C}{World Wide Web Consortium}
\abbrev{SGML}{Standard Generalized Markup Language}
\abbrev{DTD}{Document Type Definition}
\abbrev{BNF}{Backus--Naur Form}
\abbrev{ISO}{International Organization for Standardization}

The ``\emph{Extensible Markup Language} (XML) is a simple, very flexible text format [\ldots] playing an increasingly important role in the exchange of a wide variety of data on the Web and elsewhere.''\footnote{\url{http://www.w3.org/XML/}} XML is a free, platform independent, open standard~\cite{XML1.0} provided by the \emph{World Wide Web Consortium} (W3C) and a subset of the Standard Generalized Markup Language (SGML, ISO~8879). As the name Extensible Markup Language already implies, the language addresses the markup of data by adding structural information to it. XML is extensible because it does not provide a set of predefined markup tags as HTML~\cite{HTML4.01} or \LaTeX{}. In contrast, XML is a meta markup language that defines a syntax to create new domain-specific markup languages~\cite[Chap.~1]{XMLbible}. To illustrate the XML syntax an example of a new defined language is given in listing~\ref{listing_xml_simpsons} that structures a few seasons, episodes, and titles of the TV sitcom ``The~Simpsons'' as an XML document.

\begin{lstlisting}[style=XML, caption=XML example: ``The Simpsons'' seasons and episodes, label=listing_xml_simpsons]
<?xml version="1.0" encoding="UTF-8"?>
<simpsons>
	<season number="1">
		<episode number="1" title="Simpsons Roasting on an Open Fire"/>
		<episode number="2" title="Bart the Genius"/>
		<!-- [...] -->
	</season>
	<season number="19">
		<episode number="401" title="He Loves to Fly and He D'oh's"/>
		<episode number="402" title="The Homer of Seville"/>
		<!-- [...] -->
	</season>
</simpsons>
\end{lstlisting}

Listing~\ref{listing_xml_simpsons} illustrates the most important properties of an XML document. First of all it should be human readable and self describing---but that is up to the author of the XML language. According to the XML specification each document consists of an optional prolog (line~1, also called ``XML declaration'') and exactly \textit{one} root element (in the example \code{simpsons}, line~2--13). The logical structure is based on elements whereas each element has a start tag (like \code{<simpsons>}, line~2) and end tag (like \code{</simpsons>}, line~13). Each element may contain content between its start and end tag (for instance the content ``D'oh!'' of an element \code{homer}: \code{<homer>D'oh!</homer>}), attributes (like the \code{number}, and \code{title} attributes in the example), an arbitrary number of nested elements or may be empty (for example \code{<no-content/>}). XML comments (lines~6 and~11) start with the character sequence \code{<!-\mbox{-}} and end with~\code{\mbox{-}->}.

An XML document following these rules is called ``well-formed''. This is the minimal requirement for a document to be referred to as XML document at all~\cite[Section~2.1]{XML1.0}. More sophisticated XML documents are called ``valid'' if they have an associated \emph{Document Type Definition} (DTD) and if the document complies with the constraints expressed in the DTD. The property ``valid'' includes the property ``well-formed''~\cite[Section~2.8]{XML1.0}. DTDs define the structure of an XML document in terms of production rules similar to the \emph{Backus--Naur Form} (BNF). DTDs are an integral part of the XML specifications~\cite{XML1.0, XML1.1} and can be inlined in the XML document or included from another file. Adding the \emph{Document Type Declaration} statement \code{<!DOCTYPE simpsons SYSTEM "simpsons.dtd">} to the example behind the XML prolog would include the DTD shown in listing~\ref{listing_xml_simpsons_dtd}.

% More about the BNF comparison: http://lists.w3.org/Archives/Public/www-math/msg00885.html

\begin{lstlisting}[style=XML, caption=Document Type Definition for the Simpsons example, label=listing_xml_simpsons_dtd]
<!ELEMENT simpsons (season*)>
<!ELEMENT season (episode*)>
<!ELEMENT episode EMPTY>
<!ATTLIST season number CDATA #REQUIRED>
<!ATTLIST episode 
	number CDATA #REQUIRED
	title  CDATA #REQUIRED>
\end{lstlisting}

The first three lines of the DTD define the allowed elements, their relation among each other, and the frequency they may occur. The root element \code{simpsons} may contain zero or more \code{season} elements (line~1), which may contain any number of \code{episode} elements (line~2). The \code{episode} element itself is an empty element (line~3) that must contain two attributes named \code{number} and \code{title} (lines~5--7). Line~4 specifies an obligatory attribute \code{number} for all \code{season} elements. Using a validating XML processor---a software that is capable of checking an XML document against its DTD---permits to find missing or needless elements and attributes and to ensure that the document is valid.

With the knowledge of DTDs in mind an XML document is said to be an instance of a DTD~\cite[Chap.~8]{XMLbible}. Hence, the recommended way to create new XML languages is by designing the Document Type Definition first---comparable to relational databases where the entity-relationship model must be set up first~\cite[Chap.~2]{DatabaseDesign}. Instead of DTDs, that are part of the XML specifications~\cite{XML1.0, XML1.1}, more powerful and expressive approaches such as XML~Schema~\cite{XML-Schema-1.0-Part0} or RELAX~NG~\cite{RelaxNG} can be used for modelling and validating XML languages.

If an XML document makes use of more than one XML language definition, name collisions might occur as different languages may use elements with the same name for instance. Additionally, the problem of recognising which elements belong to which XML language emerge in such XML documents. To avoid these problems, XML namespaces~\cite{XML-Namespaces} can be used to provide uniquely named elements and attributes. Each XML element is bound to a unique identifier---its XML namespace. The result is an \emph{expanded name} which consists of the namespace name and the local name (like the element name). This allows to identify each node and avoids name collisions as XML names only need to be unique it their own namespace.

Currently two versions of XML co-exist: XML version~1.0~\cite{XML1.0} and version~1.1~\cite{XML1.1}. Although the latter seems to supersede the first version, the usage of XML~1.0 is still recommended, because it serves most use cases~\cite[Chap.~2]{XMLnutshell} and the tool support for XML~1.0 is considerable better than for version~1.1. The main difference between both specifications is that the ``overall philosophy of names [of elements, attributes etc.] has changed since XML~1.0. Whereas XML~1.0 provided a rigid definition of names, wherein everything that was not permitted was forbidden, XML~1.1 names are designed so that everything that is not forbidden (for a specific reason) is permitted''~\cite{XML1.1}. Names in XML~1.0 are only allowed to used characters defined by the Unicode~2.0 standard~\cite{Unicode2}. But as Unicode is being enhanced to version~5.0~\cite{Unicode5} and beyond, it would have required new versions of XML everytime new characters were included in the Unicode standard---or to allow all characters, as done in version~1.1. For reasons of clarity: this only effects XML names and not the content of an XML document. Hence, it is possible to use Unicode~5.0 characters for the content of an XML~1.0 document but not for its names.


\subsection{XML Path Language} \label{xpath}

\abbrev{XPath}{XML Path Language}
\abbrev{XSLT}{XSL Transformations}
\abbrev{XSL}{Extensible Stylesheet Language}

The main purpose of the \emph{XML Path Language} (XPath)~\cite{XPath1.0, XPath2.0} is to navigate through the tree structure of an XML document, to address certain nodes and to select parts of the document using a simple declarative syntax. For example, the path expression \code{//season} would evaluate to a sequence containing all \code{season} elements (including their child elements) from the XML document shown in listing~\ref{listing_xml_simpsons}. The expression \code{//episode[@number > 400]} would return a sequence of all \code{episode} elements with an attribute named \code{number} containing a value larger than~400.

Two versions of the XPath standard have been released as recommendations by the W3C: XPath~1.0~\cite{XPath1.0} in 1999 and XPath~2.0~\cite{XPath2.0} in 2007. Even though both version's main purpose stays the same, they differ significantly from each other. Version~1.0 was defined to ``provide a common syntax and semantics for functionality shared between \emph{XSL Transformations} (XSLT)~\cite{XSLT1.0} and XPointer~\cite{XPointer}''~\cite{XPath1.0, XMLbible}. XPath~2.0 in turn is designed to be embedded in host languages like XSLT~2.0~\cite{XSLT2.0} and XQuery~1.0~\cite{XQuery1.0}. In contrast to the specification of XPath~1.0, that includes not only the syntax and semantics of the language but also the data model, built-in library functions, and operators, XPath~2.0 is defined in a set of specialised specifications. The main document is called ``XML Path Language (XPath) 2.0''~\cite{XPath2.0} and mostly describes the concepts and syntax of XPath, meanwhile the semantics, data model, built-in functions and operators are shared with XQuery~1.0 and therefore specified in combined recommendations: ``XQuery~1.0 and XPath~2.0 Formal Semantics''~\cite{XPaXQ-Semantics}, ``XQuery~1.0 and XPath~2.0 Data Model (XDM)''~\cite{XPaXQ-XDM} and ``XQuery~1.0 and XPath~2.0 Functions and Operators''~\cite{XPaXQ-FaO}.

Compared to its successor, XPath~1.0 is a simple ``one-line language'' being less expressive and flexible as it is mainly capable of path expressions (\code{//season} for instance) whereas XPath~2.0 supports more sophisticated expressions like conditional and loop constructs. Hence, XPath~2.0 is a superset of XPath~1.0 including more language expressions and a lot of more built-in functions. Additionally, XPath~2.0 supports a richer set of data types, to take advantage of type information when documents are validated using XML~Schema. From here on, the term XPath always refers to version~2.0.

The XPath data model~\cite{XPaXQ-XDM} features two elementary properties making the handling of nodes more efficient. The first is called \emph{document order}, which is defined as the order of all nodes in which they appear in the XML document. When nodes are selected from the document, they are output in the document order by default. The document order also allows to compare the position of nodes in a before/behind relation. The second elementary property is called \emph{node identity}, which assigns a unique identity to each node what allows to differentiate between comparing nodes based on their value and their identity.

As the name XPath already indicates, path expressions are elementary constructs of the language. A path expression can be used to locate nodes within XML trees and consists of a series of one or more steps, separated by \code{/} or \code{//}, and optionally beginning with \code{/} or \code{//}. Steps are defined as axis steps or filter expressions. Axis steps define the direction of movement in the tree and filter expressions test if a selected node holds a specified condition. An axis step returns all nodes that are reachable from the context node via a specified axis and that fulfil the filter expression~\cite{XMLnutshell}.

XPath supports different forward and backward axes which reflect the direction of movement in the document order of the XML tree. Forward means nodes after or beneath the current context node; backward means nodes before or above the current context node~\cite[Chap.~3]{XQueryBrundage}. Because the axes are important for XPath all axes and their meanings are presented in table~\ref{xpath_axes}.

\begin{table}
\centering
\label{xpath_axes}
	\begin{tabular}{lp{8.5cm}}
	\rowcolor{lightgray}
	\headcolumn{Axis}			& \headcolumn{Meaning} \\
	\hline \noalign{\vspace{1mm}}
	\code{child::}			& Children of the context node. \\
	\code{descendant::}		& All children of the context node. \\
	\code{attribute::}		& Attributes of the context node. \\
	\code{self::}			& The context node itself. \\
	\code{descendant-or-self::}	& The context node and its descendants. \\
	\code{following-sibling::}	& All siblings of the context node that follow it. \\
	\code{following::}		& All nodes that follow the context node in the document. \\
	\code{parent::}			& The parent of the context node. \\
	\code{ancestor::}			& All ancestors of the context node. \\
	\code{ancestor-or-self::}	& The context node and all its ancestors. \\
	\code{preceding-sibling::}	& All the siblings of the context node that precede it. \\
	\code{preceding::}		& All nodes that precede the context node in the document. \\
	\hline
	\end{tabular}
\caption{All axes provided by XPath and their meanings}
\end{table}

For the most frequently used axes an abbreviated syntax is defined: \code{descendant-or-self::} is shortened to ``\code{//}'', \code{parent::} to ``\code{..}'', \code{self::} to ``\code{.}'', \code{attribute::} to ``\code{@}'', and \code{child::} can be omitted completely. Using the abbreviated syntax---which is favoured in this thesis wherever possible---means writing \code{episode/@number} instead of \code{child::episode/attribute::number} for example.

Looking again at the example \code{//episode[@number > 400]} that was already given at the introduction to XPath, it is now possible to disassemble the path expression step by step. Path expressions are evaluated from the left to the right side: the slashes \code{//} traverse the descendant-or-self axis of the XML tree (listing~\ref{listing_xml_simpsons}) starting from the root node, searching element nodes named \code{episode}, and selecting each as the current context node. A filter expression \code{[condition]} is now applied to the context node and the attribute axis is inspected for an attribute named \code{number} holding a value larger than 400. If this condition is true, the context node is included in the resulting sequence. All matching nodes are returned in document order as shown in listing~\ref{listing_xpath_sample_1_result}.

\begin{lstlisting}[style=XML, caption=Result of the XPath expression \code{//episode[@number > 400]}, label=listing_xpath_sample_1_result]
<episode number="401" title="He Loves to Fly and He D'oh's"/>
<episode number="402" title="The Homer of Seville"/>
\end{lstlisting}

Note that the result of an XPath expression is not needed to be well-formed or valid XML nor XML at all---it is just a sequence of items. An item is either an atomic value (of a primitive simple type defined by the XPath data model) or a node~\cite{XPaXQ-XDM}. Conforming with the XPath data model, seven nodes types are differentiated. The most important ones are: document nodes, element nodes, attribute nodes, text nodes, and comment nodes. An XPath result example is the sequence \code{(23, "foo", <bar/>)} which contains two atomic values and one element node. Another example is the path expression \code{//episode/@title} which returns a sequence of all \code{title} attribute nodes of \code{episode} elements.

Beside path expressions, sequence expressions are relevant for XPath. First of all, to construct a new sequence a pair of parentheses \code{($\cdots$)} is used to surround all elements of a sequence. The elements themselves are delimited by commas. For example, the sequence of the first seven Fibonacci numbers \code{(0, 1, 1, 2, 3, 5, 8)} or the sequence of all Simpsons season and episode numbers \code{(//season/@number, //episode/@number)}.

As already seen in the example above, filter expressions can be applied to any sequence to cherry-pick the desired nodes. The most simple filter expression uses the position of a node in its sequence. Hence, \code{(//episode)[2]} returns the second \code{episode} element node (whereas the filter expression \code{[2]} is an abbreviation of \code{[position() = 2]}). To get the last element of a sequence the function \code{last()} needs to be called. For instance, the filter expression \code{[last() - 1]} returns the penultimate element of the context sequence. It is also possible to evaluate the content of the context node (or its descendant respectively its ancestor) in a filter expression. For example, \code{(1 to 100)[. mod 5 eq 0]} lists all integers from 1~to~100 that are divisible by~5 without remainder based on the context node's content. It is possible to stack as many filters as required: \code{(//episode)[2][@foo]} would return the second episode element if it would have an attribute called \code{foo}.

Using name tests and node type tests, XPath permits to select nodes by their type and name. Using the wildcard \code{*} it is possible to select elements and attributes regardless of their names. The path expression \code{//@*} selects all attribute nodes for instance. The language provides three built-in operators for combining sequences: \code{union}, \code{intersect}, and \code{except}. All these operators eliminate duplicate nodes from their result sequences based on the node identities. Additionally, XPath supports basic arithmetic expressions (\mbox{\code{+}, \code{-},~\code{*}}, \code{div}, and \code{mod}) and comparison expressions for values and nodes.

XPath shares predefined functions and operators with XQuery~\cite{XPaXQ-FaO}. They cover a wide area of application like numerics, strings, booleans, dates and times, sequences, and casting. The specification~\cite{XPaXQ-FaO} defines 68~operations and 122~functions. Function calls looks like this: \code{data(//@number)} to get the value of all \code{number} attributes or \code{doc("http://noedler.de/index.rss")} to load an XML document.

As mentioned before, XPath~2.0 supports more advanced constructs than version~1.0 does, namely loop, conditional, and quantified expressions. Using the \code{for} expression as shown in listing~\ref{xpath_for} allows to loop through the XML tree in a more flexible way than using path expressions. The example returns tuples of all Simpsons season numbers and their appropriate episode numbers which would not have been possible using XPath~1.0.

\begin{lstlisting}[style=XQuery, caption=XPath expression using the \code{for} construct, label=xpath_for]
for $s in //season,
    $e in $s/episode
return ( data($s/@number), data($e/@number) )
\end{lstlisting}

The conditional expression is an if-then-else construct that allows to react appropriately based on values in the XML tree. Quantified expressions support existential (keyword \code{some}) and universal (keyword \code{every}) quantification whose result is true or false, which allows to check that a condition is true/false for some or all nodes. The expression in listing~\ref{xpath_quant} returns \code{true}, because every \code{episode} element satisfies the condition to own a \code{number} attribute whose value is larger than zero. Quantified expressions are often used in conjunction with conditional expressions.

\begin{lstlisting}[style=XQuery, caption=XPath quantified expression, label=xpath_quant]
every $episode in //episode
satisfies $episode/@number > 0
\end{lstlisting}


\subsection{XML Query Language} \label{xquery}

\abbrev{XQuery}{XML Query Language}
\abbrev{SQL}{Structured Query Language}

The \emph{XML Query Language} (XQuery) is described from a high-level view as ``a standardized language for combining documents, databases, Web pages, and almost anything else. It is very widely implemented. It is powerful and easy to learn.''\footnote{\url{http://www.w3.org/XML/Query/}} The language is standardised by the W3C~\cite{XQuery1.0} and its aim is to use the structure of XML intelligently to express queries across all kinds of XML. It allows to select XML elements of interest, reorganise and transform them, and return them in a user-defined order and structure~\cite[Chap.~1]{XQueryWalmsley}.

Seen from a technical point of view, XQuery is for XML what SQL is for the relational databases. ``XQuery will serve as a unifying interface for access to XML data, much as SQL has done for relational data.''\footnote{\url{http://electronics.ihs.com/news/w3c-xml-approves.htm}} XQuery is derived from an XML query language called Quilt, which borrowed features from several other languages, including SQL, XQL, and XPath~1.0. In fact XQuery~1.0 is an extension of XPath~2.0 or---defined vice versa---XPath is a complete subset of XQuery~\cite{XPath2.0}.

XQuery (and therefore also XPath) is a functional and typed language, which means it is built from expressions rather than statements. ``Every construct in the language [\ldots] is an expression and expressions can be composed arbitrarily. The result of one expression can be used as the input to any other expression. [\ldots] Another characteristic of a functional language is that variables are always passed by value, and a variable's value cannot be modified through side effects''~\cite{XPaXQ-Semantics}. XQuery is a typed language, because it is capable of importing types from an XML~Schema definition and use them to perform operations based on these types, such as early detection of type errors.

The basic structure of many (but not all) queries is the FLWOR (pronounced ``flower'') expression which stands for the keywords \code{for}, \code{let}, \code{where}, \code{order by}, and \code{return} used in the expression. FLWORs, unlike path expressions, allow to manipulate, transform, and sort results. This is the main difference and advantage of XQuery compared to XPath. The latter is only capable of retrieving nodes from XML documents whereat XQuery allows to rearrange nodes and to create new ones. The following listing~\ref{xquery_example_1} uses a FLWOR expression to restructure the Simpsons XML tree (listing~\ref{listing_xml_simpsons}) and to return a sequence (listing~\ref{xquery_result_1}) containing all Simpsons episodes sorted by their title.

\begin{lstlisting}[style=XQuery, caption=XQuery expression to restructure the Simpsons example, label=xquery_example_1]
for $episode in doc("simpsons.xml")//episode
let $title := data($episode/@title)
order by $title
return element simpsons-episode { $title }
\end{lstlisting}

The query first iterates over all \code{episode} elements (line~1) of the input XML document \code{simpsons.xml} and binds each element to a variable called \code{\$episode}, reads the data value of the \code{title} attribute of the current context node \code{\$episode}, binds the value to the variable \code{\$title} (line~2), and returns each title as the content of a new \code{simpsons-episode} element (line~4) whereas the result set is ordered by the title (line~3).

\begin{lstlisting}[style=XML, caption=Resulting sequence of query~\ref{xquery_example_1}, label=xquery_result_1]
<simpsons-episode>Bart the Genius</simpsons-episode>
<simpsons-episode>He Loves to Fly and He D'oh's</simpsons-episode>
<simpsons-episode>Simpsons Roasting on an Open Fire</simpsons-episode>
<simpsons-episode>The Homer of Seville</simpsons-episode>
\end{lstlisting}

XQuery is not a pure query language as it allows the definition of user-defined function, which can also be called recursively~\cite[Chap.~8]{XQueryWalmsley}. Modules allow the grouping of functions in a reuseable way and the language itself is proven to be Turing complete~\cite{kepser2004spt}.

The XQueryX~1.0 specification~\cite{XQueryX1.0} provides an XML syntax for XQuery. Converters such as XQ2XML~\cite{XQ2XML} permit to transform XQuery to XQueryX and an XSLT stylesheet provided by the XQueryX~1.0 specification can be used to transform XQueryX back to XQuery. XQueryX allows to use the XML representation of an XQuery expression as input for XQuery or XSLT to apply transformations and queries on itself.

One drawback of XQuery is the missing support for updating parts of an XML document without re-generating the complete document. The W3C has identified this problem and is working on a draft called ``XQuery Update Facility''~\cite{XQueryUpdate}. Another approach into this direction is the XML update language XUpdate~\cite{XUpdate}.

% XQuery myths: http://www.ibm.com/developerworks/xml/library/x-xqmyth.html
% Interview With Dr. Daniela Florescu, Editor of the W3C XQuery Specification: http://www.stylusstudio.com/daniela_florescu.html


\section{Representations of Software Artefacts} \label{sw_representations}

Software needs to be represented in varying ways to fulfil different requirements. Software architects for example require a global view of all components of a system to be able to design extensible and maintainable software. Therefore they require software representations at an abstract level. In contrast, source code analysis tools need to investigate all details of a software to find design flaws or potential bugs and require a software representation that reflects the fully detailed source code.

This section introduces facilities for software representation such as metamodels (section~\ref{sw_rep_metamodels}), the \emph{Unified Modeling Language} (UML) (section~\ref{sw_rep_uml}), XML-based formats (section~\ref{sw_rep_xml}), and other common approaches (section~\ref{sw_rep_other}) like intermediate formats and tree-based source code representations. For each representation format it is discussed whether it is capable to serve as a high-level software representation and/or full detail software representation format.


\subsection{Metamodels} \label{sw_rep_metamodels}

\abbrev{MOF}{Meta-Object Facility}
\abbrev{UML}{Unified Modeling Language}
\abbrev{OMG}{Object Management Group}

A model is an abstraction of the real world and the according metamodel is the abstraction of its model. Metamodelling is the process of constructing a collection of concepts within a certain domain, for example the software domain in the field of computer science. Metamodels define languages for the specification of models~\cite{pender2002uwc}. Hence, metamodels are playing the same role for models as---for example---grammars for programming languages. Vice versa, a model is said to conform to its metamodel such as source code conforms to the grammar of the language it is written in.

Metamodels use a four-layer metamodel architecture~\cite{pender2002uwc} that can be explained by the definition of an XML language using a DTD\footnote{\url{http://www.devhood.com/tutorials/tutorial\_details.aspx?tutorial\_id=698}} (see section~\ref{xml} for details on XML and DTD):
\begin{description}
\item[Level M0] The \emph{User Object Layer} defines specific subject domain information such as the real world data to be described. In the case of a new XML language this is the content that should be contained in the XML documents.
\item[Level M1] The \emph{Model Layer} defines the language to use to describe a subject domain. For this example of an XML language, that means which elements and attributes are required to model the content and at large the complete XML document.
\item[Level M2] The \emph{Metamodel Layer} defines the language for specifying models. In case of XML, using a DTD is one way to describe the model as the DTD defines the logical structure of an XML document. Also XML Schema and RELAX~NG are located at the metamodel layer as they can be used to describe models of XML documents.
\item[Level M3] The \emph{Meta-Metamodel Layer} defines the language for specifying metamodels. For this XML example, that means this layer defines the language a DTD is written in. In the case of XML Schema it is XML again as XML Schema uses an XML syntax.
\end{description}

The advantage of this layered structure is---assuming that the meta-metamodel is rich enough---the four-layer architecture can support and model most, if not all kinds of information and meta-information imaginable.

These metamodelling foundations are provided by the \emph{Object Management Group} (OMG)\footnote{\url{http://www.omg.org}} because the OMG was in need of a metamodel architecture for the definition of the \emph{Unified Modeling Language} (UML)~\cite{UML211}. The \emph{Meta-Object Facility} (MOF)~\cite{MOF} serves as the basis for UML and makes use of the four-layer architecture discussed above. It is defined as a closed metamodelling architecture as MOF defines a M3 model, which conforms to itself. The most prominent example of a M2 model is the UML metamodel, the model that describes UML. Hence, models written in UML are located at the M1 layer.

MOF and UML provide a ``key foundation for OMG's \emph{Model-Driven Architecture} (MDA), which unifies every step of development and integration from business modeling, through architectural and application modeling, to development, deployment, maintenance, and evolution.''\footnote{\url{http://www.uml.org}} Models expressed in a well-defined notation are a cornerstone of systems for enterprise-scale solutions.\footnote{\url{http://www-128.ibm.com/developerworks/rational/library/3100.html}} The development of systems can be organised around a set of models, transformations between models, and derivations from models~\cite{seidewitz2003mm}.

Metamodels are mainly used for the representation of software artefacts at an architectural level. Different metamodel approaches for software representation emerged and can be differentiated by their granularity of the representation of the underlying software. Some, for instance the FAMIX metamodel~\cite{tichelaar2000mml}, concentrate on the most important object-oriented entities (classes, attributes, and methods) and their relationships (access and invocations). A metamodel with about the same degree of granularity is presented by Guéhéneuc~\cite{gueheneuc2002tmr} to display program architectures and behavioural information. Limiting the granularity reduces the complexity of the metamodel and provides a high-level view of the underlying software with the restriction that the source code is not completely reflected by the model. A more detailed approach for the representation of Java source code is presented by van Emden and Moonen~\cite{vanEmden}: the metamodel ``contains information regarding program entities like packages, classes, [\ldots] methods, constructors, static blocks, and fields. Furthermore, it describes the relations between these entities such as composition, inheritance, [and] method calls''~\cite{vanEmden}.

But metamodels are not limited to the representation of software on a high-level view, but can also be used to represent source code in depth. That allows for example to apply refactorings at the model level as a model transformation and to generate the refactored source code from the transformed model~\cite{vangorp2003}. The range of metamodels representing source code in depth varies from ones that try to cover different object-oriented languages~\cite{vangorp2003} to specialised ones, like a metamodel for \ttcn{}~\cite{schieferdecker2004mmt}.


\subsection{Unified Modeling Language} \label{sw_rep_uml}

The \emph{Unified Modeling Language} (UML) is a standardised general-purpose modelling language that includes a graphical notation. UML is used to create an abstract model of a system, called \emph{UML model}. The language is specified by the Object Management Group (OMG) and the latest version of UML is 2.1.1~\cite{UML211}. UML provides a generic extension mechanism to build UML models for particular domains called \emph{UML profiles} that tailor the language to specific areas.

As already described in section~\ref{sw_rep_metamodels}, UML is formally defined using a Meta-Object Facility metamodel that defines all constructs of UML. The metamodel itself is expressed in UML. This is an example of a metacircular definition as the language is defined in terms of itself. But things are not completely circular, only a small subset of UML is used to define the metamodel~\cite{quatrani2006vmi}. But these internals are only important to those who use UML as a programming language, as it defines the abstract syntax of that language~\cite{fowler2004udt}.

More important is the overall impact of UML in the field of software engineering regarding modelling software. While analysis and design phases are known by nearly all software development processes, using UML allows to perform these phases on a unified basis. UML can be used in nearly all phases of a software development process: use case and activity diagrams in the analysis and requirements phase and class diagrams during the design phase are just a few possible applications~\cite{fox2006ise, sommerville}. A couple of development processes are based even completely on UML respectively models, for example the model-driven development that is part of OMG's model-driven architecture. The extensive usage of models starts with the automatic generation of source code stubs and can lead to the use of executable models~\cite{balcer2002euf}. In most cases, UML is used for designing and representing software on an architectural level and not as a representation format for full detailed source code.

The development process is one part of a software engineering method. Other parts are a common vocabulary and a set of guidelines. The vocabulary is used to describe the process and the products created during the application of the process. The part of all software engineering methods that can be standardised is the vocabulary, often expressed in a notation. UML is a common notation that may be applied to many different types of software projects using different methods~\cite{pender2002uwc}.

After having a look at the definition and the possible applications of UML, the different diagram types are briefly introduced. UML provides a set of thirteen predefined types of diagrams for different purposes. They are grouped into three categories\footnote{\url{http://www.omg.org/gettingstarted/what\_is\_uml.htm}}: 

\begin{enumerate}
\item Structure diagrams represent static application structure and include the class diagram, object diagram, component diagram, composite structure diagram, package diagram, and deployment diagram. 
\item Behaviour diagrams represent general types of behaviour and include the use case diagram, activity diagram, and state machine diagram. 
\item Interaction diagrams represent different aspects of interactions. They are all derived from the more general behaviour diagram and include the sequence diagram, communication diagram, timing diagram, and interaction overview diagram.
\end{enumerate}

\label{uml_classes_text}
To give an example of how UML looks, figure~\ref{uml_classes} shows a UML class diagram containing two classes. Class \code{Person} is the base class containing the attribute \code{name} and the according operation \code{getName()} to access the private class attribute. The class \code{Employee} inherits from the base class \code{Person} and adds the attribute \code{salary} and the according getter operation.

\input{pstricks/ch2_uml_classes}

What makes UML special among other representation formats for software artefacts (presented in this section) are two properties: First, UML is a general-purpose modelling language that can be used for business modelling and modelling of other non-software systems. But UML is mainly designed for modelling software what makes it different from other representations formats that are generic approaches reused for the representation of software. Second, UML is a graphical approach. Even though it is important to distinguish between the UML model and the set of diagrams as graphical representations of the models, UML is mostly used on a graphical level. While all other representation formats can also be visualised, the visualisation is not an integral part of these formats.


\subsection{XML-based Representations} \label{sw_rep_xml}

The Extensible Markup Language (XML, see section~\ref{xml} for details) is not only used for exchanging structured information but serves also as a language for representation formats. This section shows how XML is used to represent software on an architectural level and also how to represent fully detailed source code. The first part covers the standardised language \emph{XML Metadata Interchange} (XMI) that is mainly used for the representation of UML models as XML, while the second part covers the usage of XML to encode source code. 

The reasons to use XML for the representation of software artefacts are various: XML is a free and open standard that is platform-independent and commonly used. But even more important is availability of a wide range of XML processing languages such as XQuery and XSLT and XML-aware tools.

\subsubsection{XML Metadata Interchange} \label{sw_rep_xmi}

\abbrev{XMI}{XML Metadata Interchange}

The \emph{XML Metadata Interchange} (XMI)~\cite{XMI21} is an XML language standardised by the Object Management Group for serialising and exchanging metadata information such as metamodels across platforms and systems. The most common usage of XMI is as an exchange format for UML models and metamodels, although XMI can also be used for serialisation of non-UML models and metamodels. In general XMI can be used for any metadata whose metamodel can be expressed using the Meta-Object Facility (MOF) on the M3, M2, or M1 layer.

XMI bridges the gap between generic objects (for example models/metamodels) and XML by defining a new domain-specific XML language to improve the usage of XML for the interchange of objects~\cite{grose2002mxj}. The flexibility of XML allows to map objects to XML in nearly infinite possible ways. XMI addresses this property of XML and acts as a gateway by defining standards \emph{how} the mapping from an object to XML elements/attributes should be done. XMI also defines the reverse mapping to restore the original object from an XMI document. XMI specifies how to create XML Schema and Document Type Definitions (DTD) from a model and vice versa how to create a model from a given DTD or XML Schema~\cite{grose2002mxj}. 

% In the OMG vision of modelling, data is split into abstract models and concrete models. The abstract models represent the semantic information, whereas the concrete models represent visual diagrams. Abstract models are instances of arbitrary MOF-based modelling languages such as UML or SysML. For diagrams, the Diagram Interchange (DI, XMI[DI]) standard is used.

% A UML model may state that there is a class named "Person" in a model, but it cannot state that this class is represented in a diagram by a rectangle in a certain position, size, and color. To remedy this situation the XMI[DI] [18] standard has been proposed. XMI[DI] is not a model interchange format but a metamodel to describe diagram information.~\cite{alanen2005miu}

Listing~\ref{xmi_uml_classes} shows the XMI representation of the class \code{Employee} of the UML class diagram shown in figure~\ref{uml_classes}. It was created using the open-source UML modelling tool ArgoUML~0.24~\cite{ArgoUML} that uses XMI as its default serialisation format for UML models. A lot of details (XMI elements and attributes) are left out, to concentrate on the main structure of the XMI document. Additionally, the values of the \code{xmi.id} and \code{xmi.idref} attributes have been changed from automatically generated identifiers to better readable ones. The first part of this snippet of the XMI document represents the class \code{Employee} (lines~1--7), its attribute \code{salary}, and operation \code{getSalary} (lines~4 and~5). Line~2 references the generalisation of the class \code{Employee} which is represented in the second parts of the XMI snippet (lines~8--15). The inheritance relationship of the classes \code{Employee} and \code{Person} is expressed as a child-parent relation in which the class \code{Employee} is the child (lines~9--11) of the parent class \code{Person} (lines~12--14). 

\begin{lstlisting}[style=XML, caption={XMI representation of the UML class diagram figure~\ref{uml_classes}}, label=xmi_uml_classes]
<UML:Class xmi.id='ClassEmployee' name='Employee' visibility='public'>
	<UML:Generalization xmi.idref='GeneralizationPersonEmployee'/>
	<UML:Classifier.feature>
		<UML:Attribute name='salary' visibility='private'/>
		<UML:Operation name='getSalary' visibility='public'/>
	</UML:Classifier.feature>
</UML:Class>
<UML:Generalization xmi.id='GeneralizationPersonEmployee'>
	<UML:Generalization.child>
		<UML:Class xmi.idref='ClassEmployee'/>
	</UML:Generalization.child>
	<UML:Generalization.parent>
		<UML:Class xmi.idref='ClassPerson'/>
	</UML:Generalization.parent>
</UML:Generalization>
\end{lstlisting}

Applications using XMI as their primary serialisation format are for example the UML modelling tool ArgoUML~\cite{ArgoUML} as seen above and the \emph{Eclipse Modeling Framework} (EMF)~\cite{EMF}. EMF extends the usage of XMI by not only storing metamodels or models but also the data of these models (the M0 layer in the four-layer metamodel architecture, see section~\ref{sw_rep_metamodels} for details) what is not covered by the XMI specification.

One often raised point of critique regarding XMI is that the XMI standard is not detailed enough and leaves to much space for implementation-specific variations~\cite{alanen2005miu}. That causes problems when it comes to exchanging metadata using XMI between products of different vendors. In practice this means exchanging XMI files between UML modelling tools from different vendors using XMI is rarely possible~\cite{alanen2005miu}.


\subsubsection{Encoding Source Code as XML} \label{sw_rep_xml2}

\abbrev{JavaML}{Java Markup Language}
\abbrev{srcML}{Source Code Markup Language}
\abbrev{GCC}{GNU Compiler Collection}
\abbrev{GNU}{GNU is not Unix}

This section introduces the usage of XML for the fully detailed representation of source code. As the software analysis of this thesis is XML-based, the two most important varieties of XML representation formats are discussed in depth in this section.

Even though a lot of approaches exist to encode source code as XML, two basic concepts can be distinguished. The first concept maps all constructs of the original programming language to according XML elements/attributes. User-defined data such as variable names is stored as the content of these XML elements/attributes. This approach is called \emph{representation concept} as it provides a new representation of the source code. The second concept is called \emph{annotation concept} as it literally marks-up the original source code using XML elements without losing the original source code formatting. 

In particular for the encoding of Java source code as XML a lot of approaches are available. Two of these approaches are presented in depth to show the main differences between the representation and annotation concept. The following listing~\ref{java_original} contains the original source code of a Java class that will be encoded as XML.

\begin{lstlisting}[style=Java, caption=Java source code to be encoded as XML, label=java_original]
public class Person {
	private String name;
	public String getName() {
		return name;
	}
}
\end{lstlisting}

Listing~\ref{java_xml_badros} shows the encoded Java source code in an XML representation called \emph{JavaML} (Java Markup Language) that was developed by Badros~\cite{badros2000jml}. It uses the representation concept to map all language constructs of Java to XML elements. All constructs of the source code are mapped to XML elements or attributes. For example, the source code \code{public class Person} (listing~\ref{java_original}, line~1) is mapped to the XML element \code{class} (listing~\ref{java_xml_badros}, line~1) with the attributes \code{name="Person"} and \code{visibility="public"}. Listing~\ref{java_xml_badros} is simplified by leaving out a few XML attributes for reasons of clarity. JavaML typically encodes also information about the location of each statement in the original source code (using line, end-line, column, and end-column attributes), adds an identifier to each statement that allows to reference it using the \code{idref} attribute, and is also capable of representing Java comments.

\begin{lstlisting}[style=XML, caption={JavaML representation of listing~\ref{java_original}}, label=java_xml_badros]
<class name="Person" visibility="public">
	<superclass name="Object"/>
	<field name="name" visibility="private">
		<type name="String"/>
	</field>
	<method name="getName" visibility="public">
		<type name="String"/>
		<formal-arguments/>
		<block>
			<return><var-ref name="name"/></return>
		</block>
	</method>
</class>
\end{lstlisting}

An instance of the annotation concept is \emph{srcML} (Source Code Markup Language)~\cite{maletic2002scf, maletic2004lxt} which allows to markup Java, C/C++, and AspectJ source codes using XML. The following listing~\ref{java_xml_srcml} contains the result of the annotation of the example Java source code. All constructs and also the formatting of the original source code are preserved and XML elements are put around the Java constructs. In line~1 of listing~\ref{java_xml_srcml} the start of the Java class is marked-up using a \code{class} element, the visibility of the class is marked-up with a \code{specifier} element and the name of the class with a \code{name} element.

\begin{lstlisting}[style=XML, caption={srcML representation of listing~\ref{java_original}}, label=java_xml_srcml]
<class><specifier>public</specifier> class <name>Person</name> <block>{
	<decl_stmt><decl><type>private <name>String</name></type> <name>name</name></decl>;</decl_stmt>
	<function><type>public <name>String</name></type> <name>getName</name><parameter_list>()</parameter_list> <block>{
		<return>return <expr><name>name</name></expr>;</return>
	}</block></function>
}</block></class>
\end{lstlisting}

As srcML preserves the formatting of the original source code, the resulting XML looks not as structured as the JavaML representation in listing~\ref{java_xml_badros}. This might be seen as a drawback, but as most of the time the XML data is processed by XML tools and not by humans, this formatting issue does not matter. It is an advantage of the annotation concept that the original source code can be recovered with small effort by removing all XML elements. While it requires parsing the XML file and mapping it back to source code when using representation concepts like JavaML. Because of the additional effort, many instances of the representation concept do not support a regeneration of the original source code from the XML representation at all.

One property is often missing in both concepts: the line numbers of the original source code elements are often not mapped to the XML. That makes it nearly impossible to reference from XML elements to the according elements in the original source code. This feature is missing in srcML and even though the original formatting of the source code is preserved, it is impossible to reconstruct the exact position of an element (line number and offset) in the original source code using standard XML technologies like XQuery or XSLT. JavaML in contrast has an optimal support for this feature encoding the starting and ending line and offset for each element of the source as XML attributes.

While the users of JavaML cannot influence the granularity of the mapping to an XML document, srcML offers options to mark-up literals and operators which are not be marked-up by default. Altogether, it can be asserted that the overall granularity of the resulting XML documents is higher for representation concepts like JavaML than for annotation concepts like srcML. 

As mentioned at the beginning of this section, there are various approaches for the representation of Java source code as XML. Unfortunately, most of them are named \emph{JavaML}. JavaML by Badros~\cite{badros2000jml} presented in detail above and is superseeded by an enhanced version \emph{JavaML~2.0} also by Badros~\cite{aguiar2004jem}. JavaML by Mamas and Kontogiannis~\cite{mamas2000tps} follows the representation concept and is very likewise compared to JavaML by Badros. The separability of JavaML by McArthur, Mylopoulos, and Keith Ng~\cite{mcarthur2002ets} is the usage of a multi-weight parser which allows to remain some syntactic constructs unparsed. This relates to the mark-up options of srcML but based on the representation and not the annotation concept. The \emph{Extensible Software Document Markup Language} (XSDML) by Maruyama and Yamamoto~\cite{maruyama2004ctp} is conceptually not bound to, but currently implemented for Java. XSDML uses the annotation concept as srcML and is in contrast to srcML capable to encode line and offset information. Also additional semantical information such as fully-qualified names and references can be encoded. These properties make XSDML an intersection of srcML and JavaML.

Most XML formats for source code representation are often designed for \emph{one} programming language, like JavaML for Java. A few support multiple source languages, like srcML (Java, C/C++, and AspectJ). Other approaches such as \emph{OOML} (Object Oriented Markup Language)~\cite{mamas2000tps} use a higher level of abstraction and allow to map different object-oriented languages to one XML representation. The \emph{Extensible Common Intermediate Language} (XCIL)~\cite{kothari2004pbf} is also developed to provide a common semantic representation that is independent of the target language. Therefore all structural elements are based on corresponding UML definitions and are mapped to XML. Because of that ``XCIL and XMI are virtually identical except with respect to their representation of elements from the action semantics''~\cite{kothari2004pbf}.

Another approach to encode source code as XML---that relates to the representation concept---is to map an abstract syntax tree (AST, see section~\ref{sw_rep_trees} for details) to XML~\cite{zou2001tpx}. As ASTs mostly contain the line numbers and offsets for all statements of the underlying programming language, these information can be mapped to XML attributes. Only semantically relevant parts of the source code (like statements and identifiers) are represented in the AST. Comments and blank lines are not part of it and can therefore not be mapped to the XML what prohibits reconstruction of the original source code. An advantage of this approach is the very high granularity of the AST and the resulting XML, allowing to perform detailed analyses.

\label{xml_graph}

XML is not only used for the representation of static software artefacts (like source code or architectural models) but also for the representation of graphs like control flow and data flow graphs. One approach for the representation of graphs as XML is presented by Al-Ekram and Kontogiannis~\cite{alekram2005xbf}. They propose different XML formats for each kind of graphs: \emph{Control Flow Graph Markup Language} (CFGML), \emph{Program Dependence Graph Markup Language} (PDGML), and \emph{Call Graph Markup Language} (CGML). All of them are based on an intermediate XML format called FactML that is used to unify different input formats like JavaML and OOML.

A general approach for the representation of any kinds of graphs as XML is the \emph{Graph Exchange Language} (GXL)~\cite{HoltWinter2000} that is developed to enable interoperability between software reengineering tools and components. The tool XOgastan~\cite{antoniol2003xxo} permits to transform the AST created by the C/C++ compilers of the \emph{GNU Compiler Collection} (GCC) to a GXL document. XOgastan also includes special features like the extraction of control flow graphs. Hence, one application of GXL is the representation of ASTs and control flow graphs. GXL is also used as backend for an ``infrastructure that supports interoperability among reverse engineering tools''~\cite{kraft2007isi} what is important as interoperability is a ``contributing factor to the lack of adoption of available [reengineering] infrastructures''~\cite{kraft2007isi}. Using approaches like XML in general and GXL in particular, it is also possible to encode dynamic software behaviour as traces of program executions~\cite{marburger2003bat}.

% ``Venice UML Visualization Tool'' uses GXL


\subsection{Other Common Approaches} \label{sw_rep_other}

The representation of software artefacts is not limited to high-level formats such as UML and metamodels or to XML-based formats which are described in the preceding sections. Other common approaches for the representation of software artefacts are presented in the following sections: the usage of plain source code (section~\ref{sw_rep_sourcecode}), intermediate formats (section~\ref{sw_rep_intermediate}), and tree structures (section~\ref{sw_rep_trees}).

\subsubsection{Plain Source Code} \label{sw_rep_sourcecode}

Source code is not directly executable---it always requires a compiler or at least an interpreter to execute it. Therefore, source code is strictly spoken nothing but the serialisation of software or a representation format. Hence, the most obvious representation format for software is source code itself. To differentiate between source code as a target of software analysis and source code as representation format, the term \emph{plain source code} is used for the latter.

Plain source code can be used for similar purposes like other representation formats that are capable of reflecting source code in depth. Plain source code is also used to query it for anomalies using regular expressions or other lexical and syntactical approaches. See section~\ref{find_re} for details regarding this.

\subsubsection{Intermediate Formats} \label{sw_rep_intermediate}

Source code is not always directly compiled to executables or executed using an interpreter. Intermediate formats are located between the source code level and the actual execution of a program. One prominent example of an intermediate format is the Java bytecode that is the result of the compilation of Java source code. The Java bytecode (stored in class files) cannot be directly executed and needs to be executed by a \emph{Java Virtual Machine} (JVM) instead~\cite{lindholm1999jvm}. The same concept is used by Microsoft's .NET platform which is a pendant to the Java runtime environment. Microsoft's intermediate format is called \emph{Common Intermediate Language} (CIL) and fulfils the same role as the Java bytecode: it is a platform-independent representation format of the underlying source code that can be executed by any virtual machine that is capable of the intermediate format.

Intermediate formats can serve as a basis for multiple programming languages. Tool vendors only need to build compilers that transform the particular programming language to the intermediate format instead of providing more complex compilers that generates executables. Microsoft's .NET platform extensively uses this approach and transforms all .NET programming languages (for instance C\# and VB.NET) to the Common Intermediate Language (CIL). As the specifications of CIL and of the Java bytecode are publicly available, third-party programming languages are able to make use of the intermediate formats and their runtime environments. For example, IronPython\footnote{\url{http://www.codeplex.com/Wiki/View.aspx?ProjectName=IronPython}} allows Python code to run on the .NET platform and JRuby\footnote{\url{http://jruby.codehaus.org}} allows to run Ruby code using a JVM\footnote{\url{http://www.robert-tolksdorf.de/vmlanguages.html} provides a list of about 200 different projects using Java bytecode respectively the JVM as their runtime environment.}. This property of intermediate formats can be of interest regarding software analysis. As the intermediate formats provide an abstraction from the underlying programming language an analysis that is based on an intermediate format is not bound to a concrete language.

For example, the \emph{Byte Code Engineering Library} (BCEL)~\cite{BCEL} allows to access Java bytecode and acts as ``a convenient possibility to analyze, create, and manipulate (binary) Java class files [\ldots]. Classes are represented by [Java] objects which contain all the symbolic information of the given class: methods, fields, and byte code instructions, in particular.'' Using BCEL, it becomes easily possible to use Java bytecode as a starting point for the software analysis~\cite{FindBugs04, maruyama2004ctp}. Particularly with regard to closed-source software---where only the Java bytecode is available---the usage of intermediate formats gets interesting.

% Another Java bytecode analysis and manipulation library: BAT2 (http://www.st.informatik.tu-darmstadt.de/BAT) also supports XML export

\subsubsection{Tree Structures} \label{sw_rep_trees}

\abbrev{AST}{Abstract Syntax Tree}

The structure of a programming language is predefined by its grammar that defines the allowed language constructs and their combinations. The recognition of the grammar in a piece of source code is called parsing. The result of the parsing process is a \emph{Parse Tree} which is a hierarchical representation of the derivations of the source code from its grammar. An \emph{Abstract Syntax Tree} (AST) is a more economical representation of the source code, abstracting out the redundant grammar productions of the parse tree. These are for instance comments or grouping parentheses that are already expressed by the tree structure. Parse trees and abstract syntax trees are the most important tree-based data structures for the representation of source code. These trees are integral parts of nearly all compilers and interpreters as intermediate representations of the program. The trees are the abstraction of the source code in terms of the language's grammar and hence strongly dependent on the underlying programming language. An example of an AST (encoded using XML) is provided in listing~\ref{xml_ttcn3_example}.

The transformation process from the source code to an AST consists mainly of a lexical, syntactical, and semantical analysis. In the first step, the lexical analysis is performed and the source code is converted into a token stream. Each token (also called symbol) represents a single expression of the underlying grammar and gets classified (operators, identifiers, \ldots). In the second step the token stream is converted into a parse tree that represents the structure of the program. During this convertion, the syntax of the program gets validated and a differentiation between declarations, statements, expressions, etc. is carried out. Finally, the parse tree is converted into an AST that is a finite, labeled, and directed tree, where the interior nodes of the tree represent the non-terminals and the leaves terminal symbols of the grammar. During the semantical analysis a symbol table is created in which each identifier of the source code is associated with information relating to its declaration or appearance in the source, like its type and scope.


\chapter{Related Work} \label{ch_relatedwork}

This chapter covers related work regarding software analysis in the area of pattern detection in general and approaches specific to the detection of bad smell in particular. Approaches based on source code metrics (section~\ref{find_metrics}), regular expressions (section~\ref{find_re}), and logic programming (section~\ref{find_logic}) are presented followed by domain-specific languages (section~\ref{find_dsl}), XML-based approaches (section~\ref{find_xml}), and other common approaches (section~\ref{find_other}). Finally, a summary of this chapter is given and the results are discussed (section~\ref{find_summary}).

Each of the sections is made up of a few representative approaches which are presented by giving examples and discussing their properties for example regarding matching power, robustness, reusability, and limitations. Their technical realisations are not discussed in depth but mentioned briefly. The main focus is set to the detection of patterns which can be discovered using static code analysis. This is done as the software analysis framework presented and discussed in the chapters~\ref{ch_req_and_design} and~\ref{ch_impl} is also focused on static analysis.


\section{Source Code Metrics} \label{find_metrics}

This section presents approaches for the detection of pattern using source code metrics. (The terms bad smell and metric have been introduced in section~\ref{patterns}.) Even though metrics cover software measurement while bad smells are related to the structure of source code, useful intersections between both patterns exist and are described in the following. The result of a metric is by default a descriptive value that should be linked to a boundary value to become prescriptive~\cite{FY94}. Applying the metric \emph{Lines of Code} to a Java class can lead to a descriptive result of 1500~lines of code. If a Java class is considered to be too long if it contains for example more than 1000~lines of code, this boundary value is linked to the metric which can now be used to find instances of too large classes.

The connection between metrics and bad smells is that a certain boundary value can indicate the instances of bad smells. The example in the preceding paragraph shows that the metric \emph{Lines of Code} can be used to find instances of the bad smell \emph{Large Class}. Hence, metrics with a linked boundary value might be seen as a subset of bad smells as they also allow to detect structures in the code that should be refactored. The other way around, also bad smells might be seen as a subset of metrics as the number of found smells often equals the result of a metric. 

An immense number of metric tools\footnote{For example: \url{http://metrics.sourceforge.net}, \url{http://www.clarkware.com/software/JDepend.html}, and \url{http://www.mmsindia.com/jstyle.html}} for all kinds of target languages but \emph{without} automatic smell detection is available. Only a small number of approaches make use of source code metrics for the \emph{automatic} detection of bad smells~\cite{atkinson2005ldp, carneiro1988adr, simon2001mbr}. But the scope of metrics regarding smell detection is limited as they ``do not comprise the full spectrum of possible smell symptoms and still are uncertain''~\cite{walter}. Hence, more advanced approaches---like those discussed in the following sections---are required for a powerful smell detection. These approaches are focused on the internal structure of the source code and are not limited to the measurement results of metrics.


\section{Regular Expressions} \label{find_re}

\abbrev{RE}{Regular Expression}

\emph{Regular Expressions} (RE) are expressions describing sets of strings and are used for pattern matching. REs feature their own syntaxes to express the patterns to match. The most known regular expression syntax is the one of the programming language Perl~\cite{Perl}. The basic concepts of regular expressions---wildcards, alternation, grouping, quantification, and meta-variables---are extended by Perl with character classes, modifiers, backreferences, and replacements amongst others.\footnote{\url{http://search.cpan.org/dist/perl/pod/perlre.pod}} Perl-compatible regular expressions are nowadays widely adopted and implemented by nearly all newsworthy programming languages like Java, Python, Ruby, and the .NET platform.

The term ``regular expression'' has different meanings for the theory of formal languages and for pattern matching. 
Regarding formal languages, regular expressions describe the according language to the grammar the formal language is based on. Most programming languages are based on context-free grammars which are located at the type-2 level in the Chomsky--Schützen\-berg\-er hierarchy~\cite{cfl}. Additionally to type-2 grammars, most definitions of programming languages make use of static semantics which provide additional contextual constraints for the language. REs regarding pattern matching are type-3 languages according the Chomsky--Schützen\-berg\-er hierarchy and therefore less expressive. Hence, the expressive power of regular expressions is not sufficient to match patterns of programming languages which are based on type-2 grammars. 

The query power of RE-based tools like \code{grep} or \code{AWK} and strict lexical approaches in general is too limited regarding the source code domain. ``General string-searching tools can handle only trivial queries in the context of source code. Based on regular expressions, these tools do not exploit the rich syntactic structure of the programming language''~\cite{buss1994ire}. Therefore, the core concepts regarding pattern matching of regular expressions have been adopted by special-purpose languages which allow pattern detection in source code of programming languages based on context-free grammars.

\paraskip

\abbrev{SCRUPLE}{Source Code Retrieval Using Pattern Languages}

One of these languages is part of the source code search system SCRUPLE (Source Code Retrieval Using Pattern Languages)~\cite{paul1994fsc}. Its pattern-based query language can be used to specify complex structural patterns of source code as it works on a syntactical instead of a lexical representation of the source code. The pattern language provides flexibility regarding the degree of precision to which a code structure is specified.

The query language is an extension of the programming language under investigation---currently~C. The extensions include regular expression-like meta-variables which can be used as substitutes for syntactic entities in the programming language. Such as statements (\code{@}), declarations (\code{\$d}), expressions (\code{\#}), functions (\code{\$f}), and variables (\code{\$v}). The pattern-matching engine of SCRUPLE searches the source code for fragments that match the patterns. Listing~\ref{SCRUPLE_sample} shows two examples of the pattern language. The pattern in line~1 matches three \code{if} statements that follow one after another. The pattern in line~2 finds all functions that have references to the identifier \code{foo}.

\begin{lstlisting}[style=none,caption=Example of the pattern-based query language of SCRUPLE, label=SCRUPLE_sample]
if @ #; if @ #; if @ #;
$t $f_x <foo> ($v*) { @* }
\end{lstlisting}

While the query language is quite powerful, it is specific to the underlying programming language. There is no abstraction between queries and the source code to query. Hence, applying existing patterns to a new programming language requires additional effort.

\paraskip

Another language making use of the core concepts of regular expressions is \emph{TAWK}~\cite{atkinson2006epm} which is based on the \code{AWK} language. TAWK uses a language-independent pattern syntax which combines the lexical power of \code{AWK} with matching support for abstract syntax trees. ``Retargeting to a new language requires [\ldots] no special effort [\ldots] to the pattern matcher itself, since it is language independent''~\cite{atkinson2006epm}. This paper also provides a comparison of different further lexical and syntactical languages and software analysis systems like LSME (Lightweight Source Model Extractor)~\cite{LSME}, SCRUPLE~\cite{paul1994fsc}, and GENOA~\cite{GENOA} regarding their expressive power, programming power, robustness, and speed.


\section{Logic Programming} \label{find_logic}

\abbrev{LMP}{Logic Meta-Programming}
\abbrev{GenTL}{Generic Transformation Language}

This sections covers approaches for the pattern detection that make use of logic programming and \emph{Logic Meta-Programming} (LMP). While the first is the usage of logic programming languages (typically variants of Prolog) for analysing and/or transforming software artefacts~\cite{AK07}, LMP is a combination of logic programming and the analysis target itself. Both approaches require fact-based representations of the underlying programming languages to express analyses using logic rules and predicates. As this thesis attends software analysis, the transformation capabilities of logic programming are not taken into account.

An example for the usage of logic programming is \emph{JQuery}---a query-based browser for Java source code~\cite{devolder2006jgc}. Queries are formulated using a modified version of the Prolog-like logic programming language \emph{TyRuBa}~\cite{TyRuBa}. ``A~set of TyRuBa predicates and rules have been defined that make up the JQuery query language. These predicates operate on a fact database that is generated from~[\ldots]~abstract syntax trees (for~.java files) and parsing Java bytecode (for~.class files).''\footnote{\url{http://jquery.cs.ubc.ca/documentation/overview.html}}

Listing~\ref{jquery_sample} shows a query that makes use of the JQuery predicates to find all public getter methods of all Java classes. First all classes are selected and bound to the variable \code{?C}, afterwards all methods of the current class are bound to \code{?M}. Finally, all methods whose name matches the regular expression \code{/\char94get/} and whose modifier is \code{public} are returned.

\begin{lstlisting}[style=none,caption=JQuery query to find all public Java getter methods, label=jquery_sample]
class(?C), method(?C, ?M), re_name(?M, /^get/), modifier(?M, public)
\end{lstlisting}

An approach for software analysis based on logic-meta programming is the \emph{Generic Transformation Language} (GenTL)~\cite{GenTLpaper, GenTL}. Although its name states that GenTL is a transformation language, it also provides analysis capabilities. GenTL make use of logic-based programming and extends it by so called \emph{Concrete Syntax Patterns} (CSP)\footnote{\url{https://sewiki.iai.uni-bonn.de/research/gentl/concepts/predicates}}. CSPs are snippets of the programming language to analyse which contain meta-variables. Those meta-variable are logic predicates and act as placeholders for expressions of the underlying programming language. Currently, GenTL is adapted for Java but its core concepts are language-independent~\cite{AK07}.

Listing~\ref{gentl_sample} provides two GenTL examples. In line~1 a pattern is used to find all \code{if} statements. The pattern in line~2 finds all classes and binds them to the user-defined meta-variable \code{?allClasses}. ``Since CSPs match at the AST level, matching is not restricted to lexical structures''~\cite{AK07}. Hence, the pattern in line~1 matches also the source code \code{if (done) return;} which does not include the curly braces of the pattern. As CSPs do not need to specify all elements of the matched element, the pattern in line~2 also matches abstract classes even though this keyword is not contained in the pattern.

\begin{lstlisting}[style=none,caption={[GenTL examples: logic predicates and syntax patterns]Two GenTL examples: combination of logic predicates and syntax patterns}, label=gentl_sample]
if (?expr) { ??statements }
?allClasses is [[ class ?classname { ??class_members } ]]
\end{lstlisting}

The concept of LMP combines logic predicates with the syntax of the analysis target. The resulting queries look much more like the analysis target itself. Hence, they are easier to understand than pure logic-based queries like those of JQuery. ``Thus the concept of meta-variables is all that programmers have to learn in addition to mastering the analysed language''~\cite{GenTLpaper}.

A lot more approaches than the presented ones are based on logic programming: SOUL~\cite{tourwe2003iro} is a variant of Prolog and used for the detection of bad smells like \emph{Obsolete Parameter} and \emph{Inappropriate Interfaces}. ASTLOG~\cite{crew1997ale} is a logic-based language for examining abstract syntax trees. Even though ASTLOG is a Prolog variant, it avoids the overhead of translating the source code into the Prolog database and works directly on abstract syntax trees. CodeQuest~\cite{hajiyev2006css} combines the usage of logic programming and database systems. This is done by mapping Datalog queries (also a Prolog-like language) to SQL. A comprehensive overview of the available logic-based approaches regarding source code analysis and transformation (amongst others CodeQuest and JQuery) provides~\cite{kniesel2007clb}.


\section{Domain-Specific Languages} \label{find_dsl}

\abbrev{DSL}{Domain-Specific Languages}
\abbrev{PQL}{Program Query Language}

\emph{Domain-Specific Languages} (DSL) are programming languages which perform specific kinds of tasks. In contrast to general-purpose programming languages the scope of DSLs is limited by design. For just that reason, DSLs are superior in their domain compared to general-purpose languages. This sections presents a couple of languages specific to the domain of pattern detection in software artefacts.

\emph{Jackpot}~\cite{Jackpot} is an extension of the open-source development environment \emph{NetBeans}~\cite{Netbeans} which adds extensible refactoring capabilities for Java source code to the IDE. User-defined refactorings can be contributed by writing new Java classes according to the Jackpot API or by making use of Jackpot's rule language\footnote{\url{http://jackpot.netbeans.org/docs/rule-language.html}}. This language is not intended to be a complete language, but it serves the need to define pattern matching and replacement for Java source code. The basic construct of the language is \code{<pattern> => <replacement>;}. This read as: if the pattern matches, substitute it with the replacement. The pattern consists of Java constructs and meta-variables to provide wildcard-like matching of the source code. Listing~\ref{jackpot_sample} shows two examples: the pattern in line~1 shows how boolean expressions can be simplified by applying DeMorgan's Theorem. The example in line~2 uses the optional guard expression \code{::} which allows to formulate conditions under which a replacement should happen. In this case: if the object bound to the meta-variable \code{\$object} is an instance of the type \code{\$type}, the casting of this object can be removed as it is not necessary.

\begin{lstlisting}[style=none,caption=Two examples using the Jackpot rule language, label=jackpot_sample]
!($a <= $b) => $a > $b;
($type)$object => $object :: $object instanceof $type;
\end{lstlisting}

Even though the Jackpot rule language is a transformation language and no pure query language (beside the workaround to replace the pattern by itself) it is a demonstrative example for a domain-specific language related to software analysis.

\paraskip

The \emph{Program Query Language} (PQL)~\cite{PQL, martin2005fae} allows to express queries regarding sequences of events associated with a set of related objects in an application-specific context. The focus of PQL is software behaviour instead of software structure as it allows to track method invocations and accesses in related objects. Therefore PQL abstracts the program execution as a trace of events. PQL queries are patterns to be matched on the execution trace and actions to be performed upon matches. A matching query is a set of objects and a subsequence of the trace that together satisfy the pattern.

PQL is currently adapted for the analysis of Java systems and therefore PQL queries partially look like pieces of Java source code. Listing~\ref{pql_sample} shows a query which finds all occurrences where a Java OutputStream object is not shut done correctly before the surrounding method ends (lines~2--5). This is expressed as an object that is instantiated (line~3) but not terminated using its \code{close} method (expressed by the sign \url{~} in line~4). PQL is not a pure query language and can also be used for transformations. In this example the missing call to the method \code{close} can be added at the end of the method under investigation (line~6).

\begin{lstlisting}[style=none,caption=Simplified PQL query to detect potential resource leaks, label=pql_sample]
uses object OutputStream o;
matches {
	o = new OutputStream();
	~o.close();
}
executes o.close();
\end{lstlisting}

To enable those high-level queries, the PQL system hides a lot of complexity: PQL queries are represented by state machines and all inputs and results for the static analysis are stored in a relational database. Internally, static analyses are performed by translating PQL queries into Datalog (a~subset of Prolog) and using the database to resolve the queries.

While the most other software analysis systems permit patterns to be matched only against source code, a key contribution of PQL is its pattern matcher which ``combines object-based parametric matching across widely-spaced events''~\cite{martin2005fae}.

\paraskip

\abbrev{SCA}{Source Code Algebra}

The originators of SCRUPLE (see section~\ref{find_re}) found out that ``one of the fundamental problems designers of source code querying systems face is the lack of good underlying models to represent source code information and to express queries. [\ldots] We found that no satisfactory choice for the underlying model to represent program information was available''~\cite{paul1994fsc}. The desired source code query system should be able to answer queries based on global structural information, on syntactical structure, and on program flow information. Most approaches lack of either a powerful query language or adequate modeling power due to ``the absence of clean formalisms for modeling and querying source code''~\cite{paul94sca}.

To address these issues a completely different approach is chosen: modeling source code using an algebra. The \emph{Source Code Algebra} (SCA) plays the same role for source code as the relational algebra plays for relational databases. The algebra is developed as a theoretical foundation for a powerful source code query system~\cite{paul94sca, paul96sca}. The benefits of using SCA include the integration of structural and flow information and the ability to process high-level source code queries using SCA expressions. Two sample queries are shown in the following.

\begin{itemize}
\item Find all functions defined in the file \code{analyzer.c}:\\
$funcs(pick_{name='analyzer.c}(FILE))$
\item Find all functions directly or indirectly called by the function \code{sort}:\\
$closure_{calls}(pick_{name='sort}(FUNCTION))$
\end{itemize}

Even though the domain model of SCA depends on the underlying source code and must be adapted to the specifics of each programming language, the SCA query language itself is domain-independent. This is a valuable feature and essentially means that an implementation of a SCA query processor works unchanged across different SCA domain models~\cite{paul94sca}.


\section{XML-based Approaches} \label{find_xml}

\abbrev{SAX}{Simple API for XML}
\abbrev{DOM}{Document Object Model}
\abbrev{StAX}{Streaming API for XML}
\abbrev{TQL}{Tree Query Language}

As seen in section~\ref{sw_rep_xml} a lot of approaches for the representation of software as XML exist. All levels are covered: high-level and full-detail XML representations are available. The range of XML-based approaches to process XML documents and to locate patterns in these documents is also wide. This section first introduces the most important APIs for programmatical access to XML documents. The application of these APIs to the software analysis is only briefly described as they provide no declarative query possibilities. Hence, the focus is set to declarative processing languages for XML. First the languages TQL and XDuce are presented, followed by approaches that make use of the standardised XML processing languages XPath and XQuery for the analysis of software.

\paraskip

The \emph{Document Object Model} (DOM) ``is a platform- and language-neutral interface that [allows] to dynamically access and update the content, structure and style of [XML] documents''~\cite{DOM}. The DOM tree is a representation of the underlying XML document and can be traversed and transformed arbitrarily. Implementations are available for nearly all programming languages, but the drawback of DOM is its high memory requirement as the complete DOM tree must be kept in memory at once.

The opposite approach is the \emph{Simple API for XML} (SAX)~\cite{SAX} which provides a mechanism for read-only access to the data of XML documents. SAX traverses XML documents once from the start to the end and calls user-defined callback methods on occurring events. This event-driven approach makes SAX faster than DOM and features a small memory footprint. On the other hand programming with low-level API of SAX is non-trivial and it is not possible to change the XML document.

The \emph{Streaming API for XML} (StAX)~\cite{StAX} is a Java-only API which ``gives parsing control to the programmer by exposing a simple iterator based API and an underlying stream of events. [It allows a] developer to ask for the next event (pull the event) rather than handling the event in a callback''~\cite{StAX}. This design is a mixture of DOM and SAX as it allows to navigate in the XML document comparable to DOM and features adequate memory consumption and good speed like SAX.

There is no known approach directly deploying one of the APIs for the task of finding patterns in software artefacts, even though their usage is often recommended for this purpose~\cite{antoniol2003xxo, zou2001tpx, alekram2005xbf, badros2000jml}. If these APIs are used, they are wrapped into specialised high-level APIs for software analysis~\cite{maruyama2004ctp}. The situation is summarised by~\cite{maruyama2004ctp}: ``The standard APIs (e.g. DOM and SAX) are of course convenient for writing code, independent to a specific programming language but too primitive for most developers when they build tools in practice.''

\paraskip

Therefore the focus is set to declarative XML processing languages. First, two academic approaches are presented whose main targets and applications are not software analysis, but which provide properties to be used for software analysis tasks.

The \emph{Tree Query Language} (TQL)~\cite{TQLpaper} is a language to query semistructured data like unordered forests with labelled nodes. TQL can also be used to query XML trees and is therefore discussed in this section. Papers regarding TQL and a reference implementation can be found online~\cite{TQL}. TQL does not follow an approach limited to path-based pattern matching, but also incorporates logic-based elements. This makes TQL queries more declarative and less operational than queries in comparable languages like XPath. Listing~\ref{tql_sample} shows an example to clarify this feature. The query returns all elements which are located inside of a \code{book} element (\code{.bib.book.\$elements}) and which contain a \code{firstname} element whose content matches the string \code{Homer} (\code{.firstname[Homer]}). Even though the equivalent XPath expression \brcode{\$bib/book/*[.//firstname[. = "Homer"]]} is a little bit shorter, it lacks of readability compared to the TQL one.

\begin{lstlisting}[style=none,caption=TQL query which combines path-based and logic elements, label=tql_sample]
from $bib |= .bib.book.$elements.firstname[Homer]
select $elements
\end{lstlisting}

The expressive power of TQL is comparable with XPath but lesser then XQuery which is Turing-complete. Nevertheless TQL could be used to detect patterns in XML representations of software artefacts. The crucial reason \emph{not} to use TQL, is its unordered data model---while the data model of XPath respectively XQuery keeps all nodes in document order. This feature is important especially regarding software analysis when expressing queries which compare the location and/or order of software constructs. This limitation is known and future versions of TQL will resolve it~\cite{TQLpaper}. Another limitation of TQL is its missing support for user-defined functions and modules.

% Another attempt to union XML and logic programming is presented in the paper ``Processing XML-Documents in Prolog'' (2002). The queries look much more Prolog-like then the path-based approach of TQL.

While TQL links elements of logic programming with tree data structures like XML, the following approach joins XML and regular expressions. \emph{XDuce} is a statically typed XML processing language with support for regular expression pattern matching~\cite{hosoya2003xst, hosoya2003rep, XDuce}. XDuce is based on so-called regular expression types which correspond to the schema of the XML document. Hence, XDuce can make use of existing schema definitions in the DTD format. It is also possible to define own types like shown in listing~\ref{xduce_sample_1}. The type \code{Addrbook} in line~1 consists of a name, an address, and an optional telephone number which are defined in lines~2--4. Type definitions of XDuce are at about the same level of expressiveness as DTDs but not as powerful as XML schema definitions.

\begin{lstlisting}[style=none,caption=XDuce: definition of a regular expression type, label=xduce_sample_1]
type Addrbook = addrbook[(Name,Addr,Tel?)*]
type Name     = name[String]
type Addr     = addr[String]
type Tel      = tel[String]
\end{lstlisting}

XDuce supports user-definition functions and pattern matching with a single-match semantic. This is demonstrated in listing~\ref{xduce_sample_2} in which the function \code{create-telephone-list} is defined. The function makes use of the types defined in the preceding listing and takes a sequence of one or more \code{Addrbook} items as input and returns a sequence of name and telephone items. Each item of the input sequence is bound to the variable \code{item} and its content is evaluated in lines~2--6. If the \code{Addrbook} type consists of name, address, and the optional telephone number (line~3), the values of the name and telephone fields are returned (line~4). In this case and also in the case that the current address contains no telephone number (lines~5 and~6) the function is called recursively with the residual items as argument. This is the practical consequence of XDuce's single-match semantic: the first matching item is taken and further items are processed recursively.

\begin{lstlisting}[style=none,caption=XDuce function for the transformation of XML data, label=xduce_sample_2]
fun create-telephone-list (val item as Addrbook+) : (Name,Tel)+ =
  match item with
    name[val n], addr[val a], tel[val t], val rest
      -> name[n], tel[t], create-telephone-list(rest)
  | name[val n], addr[val a], val rest
      -> create-telephone-list(rest)
\end{lstlisting}

Even though XDuce supports user-defined functions, what is an advantage compared to TQL, and its query power might be adequate for most software analysis purposes, it lacks all those additional features standardised languages can provide: an active user community, multiple reference implementations, and ongoing development of tools and language concepts.

\paraskip

This is where standardised and declarative XML processing languages, mainly XPath, XQuery, and XSLT, come into the field. The first two of these languages have been introduced in detail in the sections~\ref{xpath} and~\ref{xquery}. As this thesis is focused on software analysis and not on the transformation of software, the presentation of approaches making use of XPath and/or XQuery is preferred over those using the transformation language XSLT. For more considerations on XQuery vs. XSLT regarding software analysis see section~\ref{design_xquery}.

\emph{PMD}~\cite{PMD} is a tool for scanning Java source code for potential problems. It provides a set of predefined rules for the detection of source code anomalies and two ways to define new rules. This can be done by providing new Java classes that traverse the AST or by writing custom rules using XPath~\cite{copeland2005pa}. Listing~\ref{pmd_sample} shows two example XPath expressions that operate on an XML representation of the AST provided by PMD. The pattern in line~1 finds all \code{while} loops that omit the optional braces. This is expressed by searching all \code{WhileStatement} elements whose \code{Statement} element does not contain a \code{Block} element. The expression in line~2 finds occurrences of empty if statements by searching \code{IfStatement} elements without braces and without any contained statements (\code{EmptyStatement}) and by searching empty blocks (\code{Block[count(*) = 0]}) contained by \code{IfStatement} elements.

\begin{lstlisting}[style=XQuery, caption=PMD XPath expressions for the analysis of Java source code, label=pmd_sample]
//WhileStatement[not(Statement/Block)]
//IfStatement/Statement[EmptyStatement or Block[count(*) = 0]]
\end{lstlisting}

PMD currently supports only XPath~1.0 expressions which are less powerful then XPath~2.0 ones. The reason is the XPath engine \emph{jaxen} used by PMD which only supports XPath~1.0. Support for version~2.0 of XPath is not planned.\footnote{\url{http://jaxen.org/faq.html}} PMD is not only restricted to anomalies in Java source code, but the XPath expressions are additionally tightly coupled to the AST of PMD. If the underlying grammar changes, for example due to new features in upcoming Java releases, the XPath expressions must reflect these changes as they would also be visible in the XML representation of the AST.

\abbrev{EJB}{Enterprise JavaBean}

More powerful analyses can be achieved by using XQuery instead of XPath as query language. The framework \emph{XIRC} (XML-based Information Retrieval and Conversion)~\cite{eichberg2004xkc} is used by the open static analysis platform \emph{Magellan}~\cite{Eichberg07} to support XQuery for application-specific analyses. As a case study, the analysis of \emph{Enterprise JavaBean} (EJB) classes is used and XQuery expression are applied for checking structural properties of EJBs. 

According to the EJB specification, all entity bean classes and contained methods must not be declared as final. Listing~\ref{magellan_sample} shows an XQuery expression which find classes and methods that violate this EJB constraint. In the first line all entity bean classes are located using a path expression and bound to the variable \code{\$ebs}. In the second line those classes and methods are returned that are misleadingly declared as final.

\begin{lstlisting}[style=XQuery, caption=XQuery expression searching final EJB classes and methods, label=magellan_sample]
let $ebs := //class[./annotations//@type = "javax.ejb.Entity"]
return $ebs[@final = "true"] union $ebs/method[@final = "true"]
\end{lstlisting}

Even though a central concern of Magellan is the decoupling of analyses as ``the same functionality for parsing and analyzing code is developed over and over again''~\cite{Eichberg07}, this is not done consequently as the XQuery analysis functions are bound to the underlying XML data structure what limits their reusability.

The same critique applies to the framework \emph{XCARE}~\cite{fonseca2004trc} which stands for XML-based Code Analysis and Reverse Engineering. XCARE is also based on XQuery and implements the following analysis operations: metrics, design critiques (a mixture of design defects and bad smells, the term was introduced by \emph{RevJava}\footnote{\url{http://www.serc.nl/people/florijn/work/designchecking/RevJava.htm}}), and reverse engineering operations. Listing~\ref{xcare_sample} shows an XQuery expression to find public Java classes (represented as XML using the JavaML version of Badros~\cite{badros2000jml}) that are lacking a constructor. As seen with PMD and Magellan, the query directly accesses the underlying XML document which leads to the described drawbacks.

\begin{lstlisting}[style=XQuery, caption=XCARE pattern of the design critique \emph{Uninstantiable Public}, label=xcare_sample]
for $class in $document//class
where (($class/@visibility = "public")
  and not(exists($class/constructor)))
return $class/@name
\end{lstlisting}

The \emph{RefaX} refactoring framework~\cite{mendoncca2004bfr} makes use of XQuery to check pre- and post-conditions for refactorings while the actual refactorings are implemented using the XML update language XUpdate~\cite{XUpdate}. Even though RefaX is about software transformations and the detection of bad smells is explicitly not in the scope of RefaX, the design of the framework is also of interest regarding software analysis as it addresses the problems described with PMD, Magellan, and XCARE.

The main point of critique is that most of the available approaches ``rely on their own, closed mechanisms for representing and manipulating source code information, which makes them difficult to customize, extend and reuse''~\cite{mendoncca2004bfr}. Hence, the primary design goals of RefaX are scheme-independence (support different XML data models for the same programming language) and language-independence (refactorings should be applicable to different programming languages). This is achieved by varying levels of abstraction between the XML representation of the programming language and the refactorings respectively the pre- and post-condition checks. Access to the XML data is never performed directly, but each XML data access is encapsulated so that consumer of the data needs no further information regarding the underlying structures. This approach can also be adopted for software analysis.


\section{Other Common Approaches} \label{find_other}

The methods presented above to find patterns in software artefacts cover a wide range of the available approaches. Other classes of common methods are not presented in depth, but mentioned for reasons of completeness. Approximate matching of patterns can be achieved by approaches based on fuzzy logic~\cite{niere:dpr}. The detection of duplicate code (so-called clone detection) is also a common task for software analyses~\cite{Baxter98, kontogiannis1996pmc} which can be combined with fuzzy logic~\cite{weis}. Clone detection can also take information into account contained in repositories of version control systems for source code. This allows for example to detect divergent changes over time~\cite{Saad} and so-called change patterns~\cite{bouktif2006ecp}.

Abstract syntax trees and parse trees are important data structures regarding software analysis as nearly all approaches---also those presented in the preceding sections---require parsing the source code at first. Instead of using abstract syntax trees only as intermediate format to fill XML or fact databases for instance, the syntax tree can be used as primary data structure for the software analysis. Nearly all known approaches for the detection of bad smells are based on the direct use of abstract syntax trees. For instance, the \ttcn{} tool TRex~\cite{TRex} which is used for the integration of the software analysis framework presented in this thesis, already features a pattern detection engine. It is based on traversing the AST and making use of related data structures like the symbol table and reference finder provided by TRex~\cite{Bisanz06}. The \emph{Static Analysis Framework} of the Eclipse \emph{Test \& Performance Tools Platform} (TPTP) Project~\cite{TPTP} on which the pattern detection of TRex is built, also provides a set of~70 predefined code review rules for Java. The technical foundations are very close to the pattern detection of TRex: it makes use of an abstract syntax tree---in this case provided by the Eclipse \emph{Java Development Tools} (JDT)---and detects anomalies in the source code by traversing this AST. Built on the same infrastructure is \emph{CodeNose}~\cite{slinger}, an Eclipse plug-in specialised for the detection of bad smells in Java source code. Additionally to the access to the AST, a fact database is used to detect ``more complex smells like Refused Bequest and Feature Envy''~\cite{slinger}.

\emph{FindBugs} is a tool which uses ``static analysis to look for bugs in Java code''~\cite{FindBugs}. It is based on the concept of bug patterns---code idioms that are often errors---which are described more detailed in section~\ref{patterns} and in the according FindBugs paper~\cite{FindBugs04}. As the tool's name already states, \emph{Checkstyle}~\cite{Checkstyle} is focused on the review of coding standards for Java. Checkstyle is included here, as it is extended to ``find class design problems, duplicate code, or bug patterns''~\cite{Checkstyle}. FindBugs and Checkstyle provide APIs to write new analysis function but only by plugging-in Java classes. While FindBugs and Checkstyle directly work on abstract syntax trees, a metamodel is used to represent the source code for the code smell browser \emph{jCOSMO}~\cite{vanEmden}. It is capable to detect and visualise smells in Java source code but ``it is possible to generalize our approach to other object-oriented languages''~\cite{vanEmden}. 


\section{Summary and Discussion} \label{find_summary}

Table~\ref{approaches} summarises some of the query technologies for the pattern detection presented in the preceding sections. It includes languages specific to software analysis and/or transformation of the sections~\ref{find_re}--\ref{find_dsl} but no general purpose languages like those for XML processing. This is done as the concepts of specialised languages differ significantly from those of general purpose languages. Additionally, general purpose languages feature no specifics regarding pattern detection and software analysis.

The table columns consist of the names of the approach in the first one, the class of the approach in the second column (regular expression, logic programming, or domain-specific language) and a short summary of the provided functionality. The last two columns state if the query language is independent regarding the underlying programming language and which language is the current main analysis target.

\begin{table}[h]
\centering
\label{approaches}
	\begin{tabular}{llp{5.3cm}p{1cm}p{1.2cm}}
	\rowcolor{lightgray}
	\headcolumn{Name} & \headcolumn{Class} & \headcolumn{Summary} & \headcolumn{Lang. Indep.} & \headcolumn{Analysis Target} \\
	\hline \noalign{\vspace{1mm}}
	SCRUPLE & RE & Lexical approach using RE-based patterns to match ASTs & No & C \\
	TAWK & RE & Syntactical approach using syntax patterns to match ASTs & Yes & C \\
	JQuery & Logic & Predicates to query a fact database & Mostly & Java \\
	GenTL & Logic & Predicates and syntax pattern to query a logic representation of ASTs & Mostly & Java \\
	Jackpot & DSL & Meta-variables and Java constructs & No & Java \\
	PQL & DSL & Based on objects and sequences of events & Hardly & Java \\
	SCA & DSL & Algebraic source code model, high-level queries & Yes & C \\
	\hline
	\end{tabular}
\caption{Different approaches for pattern detection in software}
\end{table}

Table~\ref{approaches} points up that the presented analysis languages base on very different approaches and are therefore hardly comparable. Only a small subset of properties like the independence of the analysed programming language or the query power can be compared. But none of these properties can be mapped to a comparable scale and therefore the results would be unsatisfying.

A conclusion that can be drawn is that domain-specific language provide well ease of use as their scope is clearly laid out and the syntax is straightforward. Another remarkable outcome is that most of the query languages specialised to one analysis target, make use of the underlying programming language itself. The Jackpot rule language for example uses Java statements like \code{instanceof} as shown in listing~\ref{jackpot_sample} what limits the language reusability by design. 

Even though only a few approaches address the language-independence regarding the analysis target, most of the approaches provide flexible and powerful query capabilities. Support for user-defined queries without making programmatic use of provided APIs is also not common. Just a few approaches allow on-the-fly integration of queries like those based on XPath or XQuery, logic programming, or domain-specific languages. The experiences and issues of this chapter flow into the requirements and design of the software analysis framework which is described in the remainder of this thesis.


\chapter{Requirements and Design} \label{ch_req_and_design}

This chapter describes the requirements and the design of a software analysis framework. In section~\ref{requirements} the requirements of the framework are presented. The transformation of the requirements into a concrete framework design and architecture is described in sections~\ref{design_architecture}, \ref{design_representation}, and \ref{design_xquery}. Finally, the overall design of the framework is discussed in section~\ref{design_discussion}. The concrete implementation of a software analysis framework that fulfils the requirements and realises the design is presented in chapter~\ref{ch_impl}.


\section{Requirements for a Software Analysis Framework} \label{requirements}

Before going into the details of the requirements for a software analysis framework, definitions of the terms \emph{software}, \emph{analysis}, and \emph{framework} are given and it is clarified how they apply in the context of this thesis.

According to the ISTQB, \emph{software} is defined as ``computer programs, procedures, and possibly associated documentation and data pertaining to the operation of a computer system''~\cite{istqb}. Regarding this thesis, the term software is used for software artefacts to be analysed. These software artefacts can represent source code or software on an abstract level as described in section~\ref{sw_representations}.

\emph{Analysis} in this context is the generic term for \emph{dynamic analysis} and \emph{static analysis} of software. The first is ``the process of evaluating behavior, e.g. memory performance, CPU usage, of a system or component \emph{during execution}'' while the latter is an ``analysis of software artifacts, e.g. requirements or code, carried out \emph{without execution} of these software artifacts''~\cite{istqb}. Another difference between both kinds of analyses is that the information obtained from a static analysis is valid for all possible executions of the software while the result of a dynamic analysis is typically valid for the run in question~\cite{jackson2000sar}.

\emph{Frameworks} provide conceptual structures to solve complex problems. Software frameworks can be defined as a ``set of cooperating classes that makes up a reusable design for a specific class of software. A framework provides architectural guidance by partitioning the design into abstract classes and defining their responsibilities and collaborations''~\cite{GammaEtAl}. Thus, frameworks provide reusable and extensible structures for application-specific problems. Customising frameworks is mostly done by inheriting from predefined structures or by writing plug-ins for extension points defined by the framework. Even though frameworks often make use of design patterns both concepts must be differentiated. While design patterns provide abstract solutions for common problems, frameworks provide concrete extensible implementations that address domain-specific problems.

Combining these three definitions results in a definition of a software analysis framework: it is a tool providing an infrastructure for the static and/or dynamic analysis of software artefacts. It can include predefined analysis plug-ins and adoptions for specific software artefacts. It allows to integrate new analysis plug-ins as well as adoptions for new software artefacts. As the focus of this thesis is the static analysis, the main task of a software analysis framework is to carry out static code analysis. ``The tool checks source code, for certain properties such as conformance to coding standards, quality metrics or data flow anomalies''~\cite{istqb}.

% ISTQB, dynamic analysis tool: A tool that provides run-time information on the state of the software code. These tools are most commonly used to identify unassigned pointers, check pointer arithmetic and to monitor the allocation, use and de-allocation of memory and to flag memory leaks.

\label{requirements_general}
The following enumeration lists general requirements for a software analysis framework, sorted by relevance, and tagged with corresponding short names (like R5 for the fifth requirement) to be able to refer to each requirement:

{ % start custom enumerate style
\renewcommand{\labelenumi}{ R\arabic{enumi} \label{R\arabic{enumi}} }
\begin{enumerate}
\item The framework must provide a way to describe patterns that occur in software artefacts (such as design patterns, bad smells, and metrics as described in section~\ref{patterns}).
\item The patterns must be describable in a generic way on a high level of abstraction.
\item The language which is used for the description of the patterns must assure well comprehensibility of the patterns. Functional and declarative languages should be favoured over procedural ones.
\item The description of patterns must be independent of specific analysis targets (for example a concrete programming language like Java or~C) and specific representations of the analysis targets (for example a concrete representation format like an abstract syntax tree).
\item The description of patterns must be human readable and directly executable by the framework at the same time.
\item The framework must be able to execute the patterns to detect instances of these patterns in software artefacts. (Such an instance is called a match.)
\item The framework must be capable to detect patterns in result of static analyses. It should also be possible to detect patterns in result of dynamic analyses.
\item The pattern detection must work accurate. It should generate as few as possible false positive and false negative matches.
\item The framework must be extensible regarding new types of patterns and also regarding new analysis targets (like new kinds of software artefacts or representation formats).
\item It must be possible to add user-defined patterns to the framework to enrich the set of predefined patterns. Writing user-defined patterns should be possible without profound knowledge of the underlying representation format.
\end{enumerate}
} % end custom enumerate style

Some requirements are not explicitly mentioned in this list as they arise from the presented requirements. For example the need to perform the analysis in an acceptable space of time to encourage developers to make use of it. Other requirements are the possibility to group related patterns and to execute a subset of the available patterns and/or plug-ins instead of executing all patterns.

The requirements R\ref{R1}--R\ref{R10} concern the framework core of describing and executing patterns and ways of extension. Requirements regarding user interfaces (UI) and usability are \emph{not} in the scope of the requirements R\ref{R1}--R\ref{R10}. Instead, the application embedding the framework should provide an appropriate user interface which permits at least to select patterns to be executed and to display the result of an analysis in a suitable, graphical way.

\paraskip
\label{requirements_rformat}

The result of an analysis is composed of zero or more matches. Matches are instances of patterns detected by the analysis in the representation format. The result of the analysis should be presented by the embedding application in the analysis target and not in its representation format. Hence, representation formats needs to fulfil the following requirements to allow the mapping of matches from the representation format to the underlying analysis target where they originate from. The following requirements for representation formats assume that the analysis target is stored in one or more files whereat each file contains one or more lines.

% TODO: is this pagebreak still ok here?
\pagebreak

For matches that encompass \emph{at most one line}, representation formats must contain the following information for each differentiable entity (such as variable names or statements) of the analysis target:
\begin{itemize}
\item minimal set of information: filename and line number
\item optimal set of information: filename, line number, start and end\footnote{Equivalent to the \emph{start and end} information is the \emph{start and length} information.} offset (based on the start of the current line)
\end{itemize}

For matches that encompass \emph{multiple lines}, representation formats must contain the following information for each differentiable entity of the analysis target:
\begin{itemize}
\item minimal set of information: filename and start and end line numbers or alternatively: filename and start and end offsets (based on the start of the current file)
\item optimal set of information: filename, start and end line numbers, and start and end offsets (based on the start of the current file)
\end{itemize}

\label{requirements_ttcn3}
The subtitle of this thesis states that the software analysis is applied to detect bad smells in \ttcn{} test suites (see section~\ref{ttcn3core} for details on \ttcn). This leads to the following list of requirements for a software analysis framework regarding \ttcn{} (sorted by relevance):

{ % start custom enumerate style
\renewcommand{\labelenumi}{ T\arabic{enumi} \label{T\arabic{enumi}} }
\begin{enumerate}
\item The framework must be customised for the static analysis of \ttcn{} source code.
\item More specific, it must be adopted for the detection of bad smells in \ttcn{} source code.
\item It must be capable to detect those bad smells described in the \ttcn{} code smell catalogue (see section~\ref{ttcn3smell} for details) that can be discovered in result of static analyses.
\item The software analysis framework must be integrated into the \ttcn{} tool TRex~\cite{TRex}.
\item It should be possible to customise the framework for the detection of test purposes in \ttcn{} source code or in behavioural representations of \ttcn{} source code. See section~\ref{new_patterns} for details regarding this.
\end{enumerate}
} % end custom enumerate style

Requirement R\ref{R10} demands the possibility to add new user-defined patterns to the framework and consists of a number of subordinate requirements. They do not directly concern the framework core itself but the user interface of the embedding application. As the framework must be integrated into the \ttcn{} tool TRex (according to requirement T\ref{T4}) the following requirements regarding user-defined patterns must be considered.

{ % start custom enumerate style
\renewcommand{\labelenumi}{ U\arabic{enumi} \label{U\arabic{enumi}} }
\begin{enumerate}
\item The application embedding the software analysis framework must allow to enter and execute user-defined patterns. Errors (like syntax errors) must be handled.
\item It must present the result of the pattern detection in an appropriate way.
\item It must be able to store user-defined patterns including pattern names.
\item It must allow to select a subset of user-defined and predefined patterns for analysis runs.
\item It should be possible to view, edit, delete, and rename stored user-defined patterns.
\end{enumerate}
} % end custom enumerate style

While this section lists abstract requirements for a software analysis framework and its customisation towards \ttcn, the following sections transfer these requirements into a concrete design for the software analysis framework.


\section{Design: A Layered and Extensible Architecture} \label{design_architecture}

This section presents the global architecture of the software analysis framework and addresses the requirements R\ref{R4}, R\ref{R7}, and R\ref{R9}. The most important requirement of these is requirement R\ref{R4} which demands patterns to be independent of the analysis targets and their representation formats. Requirements R\ref{R7} and R\ref{R9} mainly demand an infrastructure which supports adding new analysis targets and new types of patterns.

The overall design of the framework is described by differentiating between the vertical and horizontal design. The vertical design makes use of the \emph{facade} design pattern which provides a ``unified interface to a set of interfaces in a subsystem. [The] facade defines a higher-level interface that makes the subsystem easier to use''~\cite{GammaEtAl}. A positive side effect of this design pattern is a loose coupling between the layers below the facade layer and upper layers using the facade.

\input{pstricks/ch4_design_1}

Figure~\ref{design_1} gives an overview of the vertical, layered design of the framework. The boxes on the left side use abstract terms while the right side shows a concrete instance of the framework customised for the detection of bad smells in \ttcn{} source code. The first layer consists of an analysis target, \ttcn{} source code in this example. The second layer is a representation of the analysis target that acts as the primary data structure for the analysis.\footnote{The representation format and the underlying analysis target might also be the same. For example, when analysing source code and using plain source code as representation format.} The exemplified instance on the right side uses an abstract syntax tree for the representation of \ttcn{} source code. The third layer is an instance of the facade design pattern which is used for a unified access to the representation format. Towards the fourth layer the facade layer provides stable interfaces to ensure the reusability of plug-ins. Towards the layer of the representation format the facade layer must be customised for each format to actually be able to access it. The fourth layer consists of plug-ins that perform analyses. Plug-ins make use of the facade layer to access the analysis target. This layered architecture permits plug-ins to depend only on the facade layer and to be independent of the analysis target and its representation---and therewith fulfil requirement R\ref{R4}.

Plug-ins consist of a set of analysis functions and facade layers consists of a set of stable interfaces---respectively a set of functions implementing these interfaces. Both, the plug-in and the facade layer, can be topically grouped to improve the overview and reusability. Given a topical intersection of different analysis targets (for instance commonness between different programming languages), these similarities can be reflected by these layers. The smell detection plug-in for example can be grouped into generic smell detection functions and specific smell detection ones (for instance groups of functions for \ttcn{} specific and UML specific smell detection). This can also be done for the facade layer by grouping the interfaces into general interfaces and interfaces for specific analysis targets.

Note that in figures (like figure~\ref{design_1} and~\ref{design_2}) regarding the framework architecture the arrow type \hspace{5mm} \psline[arrows=o->](-0.6,0.1)(-0.1,0.1) is used for any kinds of access and the arrow type \hspace{5mm} \psline[arrows=*->](-0.6,0.08)(-0.1,0.08) for conversions (for instance conversions of software artefacts to representation formats).

\input{pstricks/ch4_design_2}

The horizontal design of the software analysis framework is shown in figure~\ref{design_2}. Extending the framework is possible by adding plug-ins such as the metrics and smell detection plug-ins in this example. As plug-ins are only bound to the stable facade interfaces, all plug-ins can be used for the analysis of all targets. Customising the framework is possible by integrating new representation formats respectively analysis targets (UML models and \ttcn{} source code in this example). This requires a customisation of the facade layer towards the representation layer to adapt the new representation format. These properties of the framework fulfil requirements R\ref{R7} and R\ref{R9}.

The core of the software analysis framework is the application of the facade design pattern to create an abstraction layer between the representation layer and the analysis layer. Plug-ins are only coupled with the facade layer what results in a decoupling of the plug-ins from the underlying representation formats respectively analysis targets. If the analysis target and/or its representation format changes or new targets/representations are added, only an adoption of the facade layer is required---the stable interfaces of the facade layer and even more important the plug-ins remain untouched. 


\section{XML for the Representation of Software Artefacts} \label{design_representation}

As discussed in section~\ref{sw_rep_xml} the Extensible Markup Language (XML) can serve as representation format for different kinds of software artefacts. This includes abstract high-level software representations, fully detailed source code representations and even ones for data and control flow graphs. ``This makes XML [\ldots]~a natural choice to be used as [\ldots]~representation format for program representations''~\cite{alekram2005xbf}.

The software analysis framework also builds upon XML as a universal representation format. Reasons for this decision are the availability of many XML-based software representation formats and XML-aware languages like XQuery and XSLT~\cite{mendoncca2004bfr}. Additionally, it is possible to transform nearly every data structure into XML without major effort. Examples regarding those transformations are given in section~\ref{sw_rep_xml} by the transformation of ASTs, models, and graphs to XML.

XML also fulfils the requirements to serve as representation format for a software analysis framework (see section~\ref{requirements_rformat}), as all information required for the mapping of matches from the representation format to the analysis target can be stored using XML elements or attributes---as long as these information are provided by the underlying representation format respectively analysis target.


\section{XQuery for the Facade Layer and Pattern Description} \label{design_xquery}

This section presents a concretion regarding the design of the software analysis framework concerning the plug-in and facade layers. In the first step the requirements for both layers are summarised, followed by a discussion of technical realisation possibilities, and a concrete recommendation for the implementation.

Concerning the plug-in layer, the main focus is to fulfil the requirements regarding the description of the patterns (R\ref{R1}--R\ref{R3}, R\ref{R5}, R\ref{R6}, R\ref{R8}, and R\ref{R10}). The core requirements for the pattern description (R\ref{R1}--R\ref{R3}) mainly demand the possibility to describe patterns in a general, abstract, and comprehensive way. According to requirement R\ref{R3}, this could be achieved by using functional and declarative languages for the pattern description. Additionally, the patterns should be human readable on the one hand and directly executable by the framework on the other hand (R\ref{R5}). By executing the patterns, the framework must be able to detect instances of these patterns in the representation format of the analysis target (R\ref{R6}).

The requirements for the facade layer arise from the global architecture of the framework (section~\ref{design_architecture}) and the design decision to use XML as representation format for the framework (section~\ref{design_representation}). Hence, the facade layer must fulfil two major requirements: it must provide stable interfaces towards the plug-in layer and it must be able to access the XML of the representation layer. The facade layer must be able to handle XML and therefore the usage of an XML-aware language is self-evident.

The most well-known, widely implemented, and standardised XML-aware programming languages are XSLT~2.0 and XQuery~1.0. The latter is described in detail in section~\ref{xquery} and its application regarding software analysis is discussed in chapter~\ref{ch_relatedwork}. XQuery is focused on querying XML while XSLT's domain is the transformation of XML. Both languages make use of XPath~2.0 but their concepts of doing this differ significantly. While XQuery extends the non-XML syntax of XPath with powerful constructs, XSLT uses an own XML-based syntax and incorporates XPath expressions. XSLT and XQuery are functional and declarative programming languages and support the concept of user-defined functions and modules which allows to group related functions into independent packages.

To demonstrate the different approaches and syntaxes of XSLT and XQuery an example is given. The goal is to transform the Simpsons example XML document of listing~\ref{listing_xml_simpsons} from section~\ref{xml} to the output shown in listing~\ref{design_xml}---once using XSLT, once using XQuery.

\begin{lstlisting}[style=XML, caption=Simpsons episodes including episode and season numbers, label=design_xml]
<episode number="1" season="1"/>
<episode number="2" season="1"/>
<episode number="401" season="19"/>
<episode number="402" season="19"/>
\end{lstlisting}

Listing~\ref{design_xslt} shows the XSLT stylesheet for the desired transformation of the Simpsons example. For each \code{episode} element which is matched in line~1, a new \code{episode} element is created in line~2. The attributes are added in line~3--8 and the according episode and season numbers are selected in lines~4 and~7 using incorporated XPath expressions. The used XSLT syntax could be abbreviated, but to demonstrate the different syntaxes of XSLT and XQuery this elaborate syntax is chosen.

\begin{lstlisting}[style=XML, caption=Using XSLT to create the XML output of listing~\ref{design_xml}, label=design_xslt]
<xsl:template match="//episode">
	<episode>
		<xsl:attribute name="number">
			<xsl:value-of select="@number"/>
		</xsl:attribute>
		<xsl:attribute name="season">
			<xsl:value-of select="../@number"/>
		</xsl:attribute>
	</episode>
</xsl:template>
\end{lstlisting}

Listing~\ref{design_xq} shows the equivalent XQuery expression which needs no further explanations as the language was already introduced in section~\ref{xquery}. The usage of a non-XML syntax eases the readability and also the effort to write new queries.

\begin{lstlisting}[style=XQuery, caption=Using XQuery to create the XML output of listing~\ref{design_xml}, label=design_xq]
for $episode in $xml//episode
return element episode { 
	attribute number { $episode/@number    },
	attribute season { $episode/../@number }
}
\end{lstlisting}

Although it would be possible to combine XSLT and XQuery, the usage of one language for the facade and plug-in layer is favoured for reasons of simplicity. The functional capabilities of both languages are at the same level. But as software analysis is more about querying than about transforming and as XQuery's non-XML syntax is better human readable than the XML syntax of XSLT, XQuery is the preferred language for the implementation of the facade and plug-in layers.

XQuery functions (respectively the function's signatures) serve as stable interfaces of the facade layer and the function's bodies perform the access to the XML of representation layer at the same time. All functions that belong to a concrete facade (such as a facade for the access to a concrete representation format) are grouped into one XQuery module. Listing~\ref{facade_xquery_1} shows an example of a complete XQuery module providing the desired functionality of the facade layer. In the first line the namespace prefix \code{example-facade} and the according namespace are declared. Line~2 contains a declaration of an \code{external} XQuery variable which means that the content of this variable is provided by the XQuery processor prior to the execution of the query. The variable \code{\$example-facade:root} contains the input XML data in the chosen representation format. In lines~4--6 the function \code{example-facade:get-classes()} is declared and covers two different purposes. First, the signature of the function serves as a stable interface. Hence, calling this facade function always returns all class definitions regardless of the underlying representation format. The second requirement is the actual access to the XML of the representation format. XQuery functions fulfilling both requirements are called \emph{facade functions}. In this example, class definitions can be accessed through the module variable \code{\$example-facade:root} and XML elements named \code{ClassDef} in line~5. Adopting the facade layer for a new representation format would only require renaming this XML element name.

\begin{lstlisting}[style=XQuery, caption=Example XQuery module of the facade layer, label=facade_xquery_1]
module namespace example-facade = "unique-facade-namespace";
declare variable $example-facade:root external;

declare function example-facade:get-classes() {
	$example-facade:root//ClassDef
};
\end{lstlisting}

XQuery modules are also used for the plug-in layer as containers for related analysis functions. These analysis functions are XQuery functions describing patterns in software artefacts. Listing~\ref{plugin_xquery_1} contains a sample module of the plug-in layer with one analysis function. Line~1 of listing~\ref{plugin_xquery_1} contains the standard header of every XQuery module: the declaration of the namespace and its prefix. In lines~3 and~4 the facade module of listing~\ref{facade_xquery_1} is imported into the current module to be able to access the facade layer functions. Lines~6--9 contain the declaration of an exemplary analysis function which returns the number of functions for each class. The access to the analysis target respectively its representation format is fully channelled through the interfaces of the facade layer. 

\begin{lstlisting}[style=XQuery, caption=Example XQuery module of the plug-in layer, label=plugin_xquery_1]
module namespace example-plugin = "unique-plugin-namespace";

import module namespace example-facade = 
	"unique-facade-namespace" at "facade.example.xquery";

declare function example-plugin:functions-per-class() {
	for $class in example-facade:get-classes()
	return count(example-facade:get-functions($class))
};
\end{lstlisting}

The requirements R\ref{R1} and R\ref{R2} demand the possibility to describe patterns in a generic way on a high level of abstraction. The usage of XQuery for the pattern description fulfils these requirements as the Turing complete~\cite{kepser2004spt} language allows to express all kinds of patterns. A high level of abstraction is achieved through the facade layer which also permits the patterns to be as generic as possible by being independent of the underlying analysis target. XQuery is a functional and declarative programming language which improves the comprehensibility of the patterns and fulfils requirement~R\ref{R3}. The syntax of XQuery is human readable on the one hand and also directly executable by an XQuery processor on the other hand---as demanded by requirement R\ref{R5}. The detection of pattern instances is done by executing XQuery functions which describe patterns. Hence, using XQuery for the pattern description also fulfils requirement R\ref{R6}.

The requirements that the detection must work accurate (R\ref{R8}) and the possibility to add user-defined patterns to a set of predefined patterns (R\ref{R10}) is also addressed. The latter can be achieved by an XQuery module for user-defined patterns and the possibility to add functions to existing modules. The accuracy of the pattern detection is mainly up to the author's care when creating new patterns.

After having solved the main technical issues by deciding to use XQuery and the introduction of XQuery-related vocabularies, a few words about the terminology are required. The term \emph{pattern} is used as an abstract term for patterns in software artefacts referring to the requirements and the design of the analysis framework. In contrast, the term \emph{function} stands for concrete XQuery functions describing such patterns. The same relation holds for the terms \emph{plug-in} and \emph{module}. While the first is an abstract term used mainly for the design and requirements, the term \emph{module} stands for XQuery modules used for the facade layer and plug-in layer. The term \emph{query} describes any XQuery expression that can be executed.

\label{design_hierarchy}

Plug-ins realised as XQuery modules consist of XQuery analysis functions. They appear in modules as flat and unstructured lists of functions. The user of a module could manually look into it and select the required functions. However, that would not work for a program as it cannot recognise the semantics of the functions. But as frameworks should smoothly interact with other programs, a way must be provided to describe the structure of modules.

This is done by an XML document for each XQuery plug-in module. The XML document represents each analysis function of the module and the hierarchical structure of the module. The structure of the XML document itself is defined by an XML schema (\code{hierarchy.xsd}). An exemplary XML document conforming to the \code{hierarchy.xsd} schema is presented in listing~\ref{hierarchy_example_1}. The root element \code{hierarchy} (lines~1--6) features the attribute \code{filename} to find the according XQuery module file. The attributes \code{namespace-prefix} and \code{namespace} allow to access the module with correct namespace information. The hierarchy itself is made up of arbitrarily nested \code{category} elements (lines~7 and~10) which contain \code{function} elements (lines~8 and~11) for each XQuery analysis function. Each \code{function} element features the attributes \code{name} used for a description and \code{xquery-name} which is the function name used in the XQuery module. The \code{function} elements can contain \code{parameter} elements (line~12) whereat each one represents one XQuery parameter of the analysis function.

\begin{lstlisting}[style=XML, caption=Describing the structure of plug-ins using \code{hierarchy.xsd}, label=hierarchy_example_1]
<hierarchy
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:noNamespaceSchemaLocation="hierarchy.xsd" 
	filename="plug-in.example.xquery"
	namespace-prefix="example-plugin"
	namespace="unique-plug-in-namespace">
	<category name="Root Category">
		<function name="Number of Functions per Class" 
			xquery-name="functions-per-class"/>
		<category name="Nested Category">
			<function name="Another Analysis Function" xquery-name="function">
				<parameter name="Limit" xquery-name="limit" type="integer"/>
			</function>
		</category>
	</category>
</hierarchy>
\end{lstlisting}

With the information provided by a hierarchy XML document it is possible to access all functions of the described module. An application can parse the hierarchy XML file and extract all required information to access the according XQuery module including its structure and the analysis functions provided by the module including their parameters and parameter types. The \code{hierarchy.xsd} schema is not bound to the software analysis focus of the framework and can be used to describe the structure of any kinds of plug-ins. The software analysis tool PMD~\cite{PMD} presented in section~\ref{find_xml} uses a similar approach to manage its rules: XML files\footnote{An example XML file of PMD can be found in this article: \url{http://www.onjava.com/pub/a/onjava/2003/04/09/pmd_rules.html}} contain related rules including descriptions, examples, and XPath expressions. This strategy permits a completely declarative way to extend the frameworks with new rules.


\paraskip
\label{design_matches}

As mentioned in section~\ref{requirements} the result of an analysis consists of matches whereat a match is an instance of a detected pattern. Analysis plug-ins return matches in order to allow the embedding application to display the matches in the underlying analysis target. Each analysis function is an XQuery function which returns XML nodes. These nodes constitute matches and reflect instances of patterns in terms of selections of the representation format (subtrees of the XML input document). Instead of returning complete matches only the information required for locating the matches in the analysis target needs to be returned. To enhance the reusability of the software analysis framework the way of returning matches is standardised by an XML Schema definition (\code{matches.xsd}). This schema reflects the requirements any representation formats needs to fulfil for the mapping of matches to the analysis target (these requirements are defined in section~\ref{requirements}).

Instead of presenting the schema itself in detail, listing~\ref{match_example_1} shows an instance of an XML document which conforms to the schema. The root element (line~1) is called \code{matches} and holds an arbitrary number of \code{match} elements. In this example two \code{match} elements are part of the XML document (lines~4--7), each featuring a set of attributes allowing to locate match in the analysis target. These are the attributes \code{filename}, \code{offset}, and \code{offset-end}. Optionally the schema allows the additional attributes \code{line} and \code{line-end} to specify additional location information. The attribute \code{found-by} of a \code{match} element permits to recognise which match originates from which analysis function. For a unique mapping of the matches to the analysis functions the value of the \code{found-by} attribute should be the namespace prefix followed by the function's name (for example \code{example-plugin:functions-per-class}).

\begin{lstlisting}[style=XML, caption=The schema \code{match.xsd} defines how matches are returned, label=match_example_1]
<matches 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
	xsi:noNamespaceSchemaLocation="matches.xsd">
	<match filename="foo.c" offset="17" offset-end="33" found-by="pattern1"/>
	<match filename="bar.c" offset="44" offset-end="68" found-by="pattern2">
		<caused-by offset="23" offset-end="27"/>
	</match>
</matches>
\end{lstlisting}

Each \code{match} element may have a child element \code{caused-by} to refer to a place in the analysis target which caused the current match. An example is shown in line~6 in listing~\ref{match_example_1}. The usefulness of the caused by information can be explained with the bad smell \emph{Unreachable Default} (see section~\ref{smell_unreachable_default}). The actual smell is a default which is unreachable due to an else branch of an alt construct. This else branch can be reported as the place that caused the smell. It can be useful to always report such places when the actual smell and its root cause are different. Each \code{match} element may feature an optional attribute \code{return-value} which is not shown in the example and allows to pass additional information from the analysis function to the subsequent application.

The detection of duplicates is a common task for any software analysis framework. Hence, this is reflected by the schema as it allows \code{duplicates} elements which contain two or more \code{match} elements representing the duplicates. See listing~\ref{match_example_2} for an example of three duplicates.

\begin{lstlisting}[style=XML, caption=Returning duplicate matches conforming to \code{match.xsd}, label=match_example_2]
<matches 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:noNamespaceSchemaLocation="matches.xsd">
	<duplicates>
		<match filename="foo" offset="11" offset-end="17" found-by="pattern1"/>
		<match filename="foo" offset="23" offset-end="29" found-by="pattern1"/>
		<match filename="foo" offset="80" offset-end="86" found-by="pattern1"/>
	</duplicates>
</matches>
\end{lstlisting}

The \code{matches.xsd} schema reflects requirements for analysing software but is kept as general as possible and could be used by other plug-ins. As this schema is only bound to the plug-ins using it and not to the framework itself, other plug-ins might use own schemata to define their ways of returning matches.


\section{Discussion of this Approach} \label{design_discussion}

As seen in the previous sections the general requirements R\ref{R1}--R\ref{R10} are covered by the design and architecture of the software analysis framework. The requirements T\ref{T1}--T\ref{T5} regarding \ttcn{} and those concerning user-defined pattern (U\ref{U1}--U\ref{U5}) are not yet discussed as they must be addressed by the application embedding the framework. Therefore these requirements are picked up again in the discussion of the framework implementation in section~\ref{impl_discussion}.

Reasoning about the pros and cons of the design and architecture of the software analysis framework reveals one main point of critique: too few specialisation respectively too much generalisation due to the usage of common technologies like XML and XQuery. Languages specialised for software analysis as presented in chapter~\ref{ch_relatedwork} can be more powerful and intuitive to use. But the high degree of specialisation is also a drawback of these languages at the same time as they are tightly coupled with their underlying data structures and analysis targets. Exactly this is addressed by the presented framework architecture which permits a decoupling of analysis targets from the actual analyses. This allows to write analyses on a high level of abstraction what improves the reusability of analyses and permits eminent extensibility of the framework. Additionally, the readability of the patterns benefits from the abstraction.

The usage of XML as representation format permits to analysis any kinds of data as it is the ``lingua franca'' for information markup and exchange. But XML tends to bloat up the encoded information by being verbose (for instance element names are repeated in the closing tag) and this can slow down the analysis process. As a counter strategy, the storing of XML information in an XML-aware database can be considered. But the utilisability of this approach depends on the concrete analysis target. If the content of the analysis target changes often, the additional effort to keep an XML database synchronous with the analysis target might be considered as too high.



\chapter{Implementation and Test} \label{ch_impl}

\abbrev{XAF}{XQuery-based Analysis Framework [for Software]}

The concrete implementation of a software analysis framework discussed in this chapter is called \emph{XAF} which stands for ``\underline{X}Query-based \underline{A}nalysis \underline{F}ramework [for Software]''. To point out that the design of the framework and central parts of the implementation are not limited to the analysis of software, the appendix \emph{for Software} is written in squared braces. The concrete implementation is focused on the detection of bad smells in \ttcn{} test suites and is therefore named \emph{XAF for \ttcn}. For reasons of simplicity the implementation is referred to as \emph{XAF} instead of its full name.

This chapter is split up in the following sections: in section~\ref{impl_software} the software technologies used for the implementation of XAF are introduced, section~\ref{impl_java_xquery} describes possibilities of interaction between Java and XQuery also used for the implementation, section~\ref{impl_ttcn3_xml} covers the concept and implementation of the conversion of \ttcn{} source code to XML. The main focus of this chapter are sections~\ref{impl_facade} and~\ref{impl_xquery_smells} which cover the usage of XQuery for the \ttcn{} facade layer and XQuery for the pattern description of bad smells. The integration of the framework into the \ttcn{} tool TRex is presented in section~\ref{impl_trex}, followed by the implementation of user-defined queries and according use cases in section~\ref{impl_udq}. Section~\ref{impl_tests} covers testing XAF and a discussion of the implementation is given in section~\ref{impl_discussion}.

The source code of XAF can be found in the Subversion repository of TRex which is located at \url{http://www.trex.informatik.uni-goettingen.de/svn/trex/}.

\section{Underlying Software Technologies} \label{impl_software}

\abbrev{TRex}{\ttcn{} Refactoring and Metrics Tool}
\abbrev{IDE}{Integrated Development Environment}
\abbrev{ANTLR}{ANother Tool for Language Recognition}
\abbrev{TPTP}{Test \& Performance Tools Platform}
\abbrev{EPL}{Eclipse Public License}
\abbrev{OSI}{Open Source Initiative}
\abbrev{MPL}{Mozilla Public License}
\abbrev{LGPL}{GNU Lesser General Public License}

This section gives an overview of the software used for the implementation of the XQuery-based Analysis Framework for \ttcn. According to requirement~T\ref{T4} the framework should be integrated into the \ttcn{} refactoring and metrics tool \emph{TRex}~\cite{TRex}. TRex was initially developed as a part of the master's thesis of Zeiss~\cite{Zei06} and provides ``IDE functionality for the \ttcn{} core notation, and supports the assessment and automatic restructuring of \ttcn{} test suites by providing suitable metrics and refactorings''\footnote{\url{http://www.trex.informatik.uni-goettingen.de}}. TRex is built up on the Eclipse platform~\cite{Eclipse} and makes use of foundations provided by the bachelor's thesis of Kemnade~\cite{bsc-kemnade}. XAF is also integrated into TRex respectively the Java-based Eclipse platform. The Eclipse way to extend and integrate is to create new plug-ins. Extension points define where and how plug-ins can extend existing functionality and add new features. Therefore XAF plug-ins for TRex are provided to couple the framework with TRex. For the implementation of XAF plug-ins and the integration into TRex see section~\ref{impl_trex}.

\emph{ANTLR} (ANother Tool for Language Recognition) ``is a language tool that provides a framework for constructing recognizers, interpreters, compilers, and translators from grammatical descriptions''~\cite{ANTLR}. The grammatical structure of the \ttcn{} core notation (provided by the \ttcn{} specification~\cite{t3standard1}) was translated to the ANTLR grammar syntax. This allows TRex to make use of ANTLR~2.7 to parse the \ttcn{} source code and to transform it to an abstract syntax tree. For details on the usage of ANTLR regarding \ttcn{} and its TRex integration see the master's thesis of Zeiss~\cite{Zei06}.

TRex features not only plug-ins for refactoring and metrics but also for pattern-based smell detection in \ttcn{} test suites. The pattern-based smell detection plug-ins as well as the \ttcn{} code smell catalogue (see section~\ref{ttcn3smell}) originate from the master's thesis of Bisanz~\cite{Bisanz06}. The pattern detection plug-ins make use of the \emph{Static Analysis Framework} provided by the Eclipse \emph{Test \& Performance Tools Platform} (TPTP) Project~\cite{TPTP}. To make use of the static analysis framework an application needs to extend predefined abstract classes for analysis providers, categories, rules, and analysis results. Providers are the topmost of these classes. They contain all categories and rules and control the execution of analysis runs. Categories are used to structure analysis rules and can include arbitrary subcategories. Rules perform the actual analyses and can be parametrised using rule parameters. Result classes prepare the analysis results in a way the framework can display them using the predefined user interface (UI). The user interface of the static analysis framework is not only responsible for the graphical presentation of analysis results but also for the selection of categories and rules for analysis runs. For details on the usage and structure of the TPTP static analysis framework see~\cite{Bisanz06, TPTPTutorial1, TPTPTutorial2, TPTPTutorial3}. The framework is also used for the integration of XAF into TRex and acts as a link between both components. The latest available version~4.4 of TPTP which requires Eclipse version~3.3 is used for the implementation of XAF. This is no additional restriction, as the upcoming release~0.6 of TRex requires Eclipse~3.3 anyway.

The XQuery processor is a central part of the XQuery-based analysis framework as it is responsible for the execution of the queries and the detection of the patterns. The most important requirements regarding the processor are a complete and correct implementation of the XQuery~1.0 standard and a good performance. \emph{Saxon}~\cite{Saxon} is an XSLT and XQuery processor developed by Michael Kay which fulfils these requirements as Saxon is the only processor that reached 100\%~pass rates\footnote{\url{http://www.w3.org/XML/Query/test-suite/XQTSReport.html}} against the \emph{XQuery Test Suite}\footnote{\url{http://www.w3.org/XML/Query/test-suite/}} provided by the W3C. These results are due to the fact that the author of Saxon is the editor of the XSLT~2.0 standard~\cite{XSLT2.0} and one of the editors of the XPath~2.0 standard~\cite{XPath2.0}. Hence, Saxon is something like a reference implementation of an XSLT and XQuery processor.

Saxon is available in different versions: \mbox{Saxon-B} (for \emph{b}asic) is the less powerful but freely available open-source version, \mbox{Saxon-SA} (for \emph{s}chema-\emph{a}ware) is a commercial product built on the same source code basis as \mbox{Saxon-B}. Both products are available for Java and Microsoft .NET platforms. \mbox{Saxon-SA} provides advanced features like schema-awareness (allows to import an XML Schema and to validate input and output trees) and advanced optimisers which recognise joins in XPath expressions and XQuery FLWOR expressions. Starting with version~8.9.0.4 \mbox{Saxon-SA} provides the ability to translate XQuery expressions ``directly into Java source code, reducing execution time by anything from~25\% to~80\%.''\footnote{\url{http://www.saxonica.com}} The superior performance of Saxon-SA is also backed by third-party performance tests~\cite{Eichberg07}.

XAF utilises the Java edition of \mbox{Saxon-B} (version~8.9.0.4) as Eclipse and TRex are also based on Java. To be able to use Saxon from inside of TRex is must be packaged as an Eclipse plug-in. Hence, the plug-in \brcode{de.ugoe.cs.swe.saxon} contains all Saxon Java archive (JAR) files and exports Saxon's packages which are located in the namespaces \brcode{net.sf.saxon.*} to allow other plug-ins to makes use of Saxon.

All underlying software technologies used for the implementation of XAF are open-source software: Eclipse, TPTP, and TRex are available under the terms of the Eclipse Public License~1.0 (EPL)\footnote{\url{http://www.opensource.org/licenses/eclipse-1.0.php}} while \mbox{Saxon-B} is licensed under the terms of the Mozilla Public License~1.0 (MPL)\footnote{\url{http://www.opensource.org/licenses/mozilla1.0.php}}. Both licenses follow the Open Source Initiative (OSI)\footnote{\url{http://www.opensource.org}} definition of open source\footnote{\url{http://www.opensource.org/docs/osd/}} and are open-source licenses approved by the OSI. All parts of XAF that relate to the TRex integration are also licensed using the EPL~1.0 while the XQuery core of XAF is licensed under the terms of the GNU Lesser General Public License~2.1 (LGPL)\footnote{\url{http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html}}. The reason for this decision is the stronger \emph{copyleft} property of the LGPL compared to the Eclipse Public License. Hence, all contributions made to LGPL licensed source code must also be licensed using LGPL while EPL allows to license contributions using non-free licenses. The combination of the different licenses does not lead to problems regarding license compatibility as no source code was copied, shared, or linked together.


\section{Alliances of Java and XQuery} \label{impl_java_xquery}

\abbrev{XQJ}{XQuery API for Java}

This section describes ways to combine the usage of Java and XQuery. This is necessary as the patterns are described using XQuery. Indeed, the execution of the patterns is controlled by TRex which is Java-based. In the first part of this section the \emph{XQuery API for Java} is introduced which provides a standardised way of executing XQuery expressions from Java. The second part covers the other way around and describes a possibility to access Java from XQuery.

The \emph{XQuery API for Java} (XQJ)~\cite{XQJ0.9} is currently being standardised as Java Specification Request~225. The specification reached the status of a public review draft in May~2007 and the final standard is expected for the year~2008. XQJ is a ``common API that allows an application to submit queries conforming to the W3C XQuery~1.0 specification and to process the results of such queries''~\cite{XQJ0.9}. Hence, XQJ is for XQuery what the \emph{Java Database Connectivity} (JDBC) API is for relational databases. XQJ also provides a mapping of XQuery data types to Java data types and other advanced features like transactional operations. The current Java edition of Saxon implements the latest XQJ specification~0.9 and is used by XAF for the execution of the XQuery expressions.

The usage of XQJ is straightforward as listing~\ref{xqj_example} shows. The method \code{executeXQuery} (line~1) takes an XQuery expression as string parameter and returns an \code{XQResultSequence} object which represents a sequence of items. As mentioned in section~\ref{xpath} such a sequence consists of atomic values and/or nodes. It is up to the caller of the method \code{executeXQuery} to loop through the sequence and extract the desired information. In lines~2 and~3 an \code{XQExpression} object is instantiated and its method \code{executeQuery} is called in line~4. The query's result is stored in the \code{XQResultSequence} object which is returned in line~6 if it contains any items (test in line~5). Otherwise an \code{EmptySequenceException} exception is risen in line~8 to indicate that the sequence is empty.

\begin{lstlisting}[style=Java, caption=Executing an XQuery expression from Java using XQJ, label=xqj_example]
public XQResultSequence executeXQuery(String query) throws Exception {
	XQExpression expression = 
		new SaxonXQDataSource().getConnection().createExpression();
	XQResultSequence result = expression.executeQuery(query);
	if (result.next())
		return result;
	else
		throw new EmptySequenceException();
}
\end{lstlisting}

Some XQuery tools offer Java bindings which permit creating Java objects and calling Java methods from within XQuery. These non-standardised Java bindings were introduced by Saxon\footnote{\url{http://www.saxonica.com/documentation/extensibility/functions.html}} and adopted by other XQuery tools like the XML database \emph{eXist}~\cite{eXist}. Listing~\ref{xquery_java_binding} shows an XQuery expression inspired by the eXist documentation\footnote{\url{http://exist.sourceforge.net/xquery.html\#N10599}} making use of the Java bindings. The query uses the \code{java.io.File} class to return a sequence of the XML elements \code{directory} and \code{file}. These feature according \code{name} attributes reflecting the contents of the current working directory.

\begin{lstlisting}[style=Java, caption=Using Java objects and methods from within XQuery, label=xquery_java_binding]
declare namespace file="java:java.io.File";

for $current-file in file:listFiles(file:new("."))
let $name := file:getName($current-file)
return
	if (file:isDirectory($current-file)) then
		<directory name="{ $name }" />
	else
		<file name="{ $name }" />
\end{lstlisting}

In line~1 of listing~\ref{xquery_java_binding} the XQuery namespace \code{file} is bound to the Java class \code{java.io.File}. In line~3 a file object is instantiated (\code{file:new(".")}) and used as argument of the method \code{listFiles}. This style differs from the classic Java way of calling member methods. Using the XQuery Java bindings the current object instance is always passed as first parameter to the called member method. This can also be seen in line~4 and~6 for the calls of the methods \code{getName} and \code{isDirectory}. To clarify the difference the equivalent Java source code snippet is shown in listing~\ref{xquery_java_equivalent}.

In line~3 of listing~\ref{xquery_java_binding} each Java file object returned by the method \code{listFiles} is bound to the XQuery variable \code{\$current-file}. This variable is used to get the filename (line~4) and to distinguish between directories and files (line~6). Finally, a \code{directory} respectively a \code{file} element is returned including the according \code{name} attribute (lines~7 and~9).

\begin{lstlisting}[style=Java, caption=Java equivalent of listing~\ref{xquery_java_binding}, label=xquery_java_equivalent]
for (File currentFile : new File(".").listFiles()) {
	String name = currentFile.getName();
	if (currentFile.isDirectory())
		System.out.println("directory: " + name);
	else
		System.out.println("file: " + name);
}
\end{lstlisting}

The possibility to access arbitrary Java classes from within XQuery is a powerful extension mechanism. It is also used by the implementation of XAF to make use of existing methods already implemented in Java by TRex.


\section{Conversion of TTCN-3 Source Code to XML} \label{impl_ttcn3_xml}

As the framework is integrated into TRex---and this ``dinosaur'' is already capable of parsing \ttcn{} source code using ANTLR---it is self-evident to build the XML representation of \ttcn{} on these foundations. Each \ttcn{} source code file is represented as an abstract syntax tree by TRex\footnote{``In a strict sense the TRex syntax tree is not abstract as it contains elements without semantic relevance as well. Hence, it is similar to a parse tree/concrete syntax tree.''~\cite{Bisanz06}}. This tree can be accessed through the class \code{LocationAST} in the package \brcode{de.ugoe.cs.swe.trex.core.analyzer.rfparser}\footnote{For an overview of the TRex packages and plug-in structure see: \url{http://www.trex.informatik.uni-goettingen.de/trac/wiki/PluginStructure}}. The class \code{LocationAST} extends the class \code{CommonAST} provided by ANTLR and adds ``corresponding start and end offsets as well as the line in the parsed source file, the associated token in the token stream and the associated scope''~\cite{Zei06}.

The information stored in \code{LocationAST} objects fulfil the requirements to serve as representation format for the software analysis framework. The location information of the \ttcn{} source code entities can be used to map matches found by the analysis framework to the original source code. Hence, a nearly complete one-to-one mapping of \code{LocationAST} objects to XML documents is required. \ttcn{} definitions, operators, statements, and blocks (respectively their AST representations) are mapped to XML elements because elements can contain other nested elements. \ttcn{} identifier and literals are mapped to XML text nodes and location information (like the filename, line numbers, and offset information) are mapped to XML attributes.

The implementation of the XML export is done by extending the class \code{LocationAST} by a new public method \brcode{xmlSerialize(Writer stream, String fileName, boolean indentXML)} and some private helper methods. The AST is traversed recursively and all elements of the tree are mapped to according XML elements and attributes as described in the previous paragraph. The resulting XML document is written to the \code{Writer} object passed to the \code{xmlSerialize} method. The parameter \code{String fileName} must be passed as this information is not already part of the \code{LocationAST} class. The filename is added an an XML attribute to the root element \code{TTCN3File} to be able to relocate the according \ttcn{} source code file. This is required when the XML result of the analysis is mapping back to the \ttcn{} source code. The XML document can be created with indentation for better readability. This is controlled by the parameter \code{boolean indentXML}. Additionally, the attribute \code{endLine} is added to the class \code{LocationAST} which denotes the line in which the associated token ends.

Listing~\ref{xml_ttcn3_example} shows a part of the XML document of the serialised \code{LocationAST} object representing the \ttcn{} source code of listings~\ref{ttcn3_http_ports} and~\ref{ttcn3_http_testcase} from section~\ref{ttcn3core}. The XML snippet in listing~\ref{xml_ttcn3_example} represents this source code: \code{template charstring HTTPResponse := pattern "HTTP/1.1 \char92d\char92d\char92d *";} (it is taken from line~9 in listing~\ref{ttcn3_http_testcase} from section~\ref{ttcn3core}). Definitions, blocks, and types are represented as XML elements. Identifier and literals as XML text nodes, and location information as XML attributes. For reasons of clarity only the attributes of the \code{ModuleDefinition} element in line~1 are included. Normally each element features the attributes \code{line}, \code{offset}, and \code{offset-end}. The attribute \code{line-end} is only added if it is different from the attribute \code{line} to avoid information duplication. As the template in this example is defined in one line, the attribute \code{line-end} is not included.

\begin{lstlisting}[style=XML, caption=Partial XML representation of a \code{LocationAST} object, label=xml_ttcn3_example]
<ModuleDefinition line="9" offset="234" offset-end="297">
	<TemplateDef>
		<BaseTemplate>
			<Type><PredefinedType>charstring</PredefinedType></Type>
			<Identifier>HTTPResponse</Identifier>
		</BaseTemplate>
		<TemplateBody>
			<SingleValueOrAttrib>
				<MatchingSymbol>
					<CharStringMatch>HTTP/1.1 \d\d\d *</CharStringMatch>
				</MatchingSymbol>
			</SingleValueOrAttrib>
		</TemplateBody>
	</TemplateDef>
</ModuleDefinition>SemiColon
\end{lstlisting}

The method \code{xmlSerialize} of the \code{LocationAST} class is used by the main XAF plug-in \brcode{de.ugoe.cs.swe.trex.xaf} to create one global XML document representing all \ttcn{} files that should be analysed. This is performed by the method \code{createXML} of the class \code{Analysis}. The \code{createXML} method iterates over all files to be analysed and instantiates a \code{TTCN3Analyzer} object for each file. The analyser object permits access to the \code{LocationAST} object and on which finally the \code{xmlSerialize} method is called. The resulting global XML document is written to a file named \code{ttcn3.xml} which is used as input for the XQuery facade layer. Listing~\ref{xml_ttcn3_example2} shows the global structure of such an XML document. One XML file representing multiple \ttcn{} files allows analyses crossing the file borders but can also lead to large file sizes. The file sizes for the XML representation of the TRex AST can be estimated by 1.6 megabyte per thousand lines of \ttcn{} source code.

\begin{lstlisting}[style=XML, caption=Structure of the global XML document, label=xml_ttcn3_example2]
<TTCN3Files>
	<TTCN3File filename="/folder/moduleFoo.ttcn3">
		<!-- [...] -->
	</TTCN3File>
	<TTCN3File filename="/folder/moduleBar.ttcn3">
		<!-- [...] -->
	</TTCN3File>
	<!-- [...] -->
</TTCN3Files>
\end{lstlisting}


\section{XQuery Facade Layer for TTCN-3} \label{impl_facade}

This section describes the implementation of the XQuery-based facade layer according to its design presented in section~\ref{design_xquery}. The purpose of the facade layer is to provide an abstraction from the underlying representation format respectively analysis target and a decoupling of the analysis plug-ins from the underlying layers.

The XQuery module \brcode{facade.ttcn3.trex.xquery} and all other XQuery and XML files belonging to XAF are contained in the Eclipse plug-in \brcode{de.ugoe.cs.swe.xaf}. The naming of the plug-in already states that XAF is neither part of TRex nor depends on it. Indeed, the facade module is customised to provide access to \ttcn{} source code provided by TRex. This is reflected in the naming of the XQuery module file (\brcode{facade.ttcn3.trex.xquery}\footnote{XAF facade namespaces and module files should be named according to the pattern \brcode{facade.analysis-target.representation-format[.xquery]}. For instance, \brcode{facade.java.srcml.xquery} for a facade module file providing access to Java source code using the representation format srcML.}) and the module namespace (\brcode{de.ugoe.cs.swe.xaf.facade.ttcn3.trex}, see listing~\ref{impl_facade_1}, line~1).

Listing~\ref{impl_facade_1} shows the header of the facade layer module for the access to the XML-encoded \ttcn{} source code provided by TRex. Line~1 contains the mentioned namespace declaration, followed by the declaration of the external variable \code{\$xfacade:path} in line~3. The value of this variable needs to be provided by the XQuery processor and is the absolute path to the input XML file \code{ttcn3.xml}. The XML structure contained in this file is loaded into the global module variable \code{\$xfacade:root} in line~4 using the \code{doc} function. This variable is used by the facade functions to provide a channelled access to the underlying representation format.

Instead of passing the path to the XML file as an external variable to the XQuery module, the XML document itself could have been passed directly as an external variable. The presented solution is chosen because performance tests revealed that passing the document directly is considerably slower than passing the path and reading the file using the \code{doc} function.

\begin{lstlisting}[style=XQuery, caption=Header of the XQuery facade layer module for \ttcn{}, label=impl_facade_1]
module namespace xfacade = "de.ugoe.cs.swe.xaf.facade.ttcn3.trex";

declare variable $xfacade:path external;
declare variable $xfacade:root := doc($xfacade:path);
\end{lstlisting}

Listing~\ref{impl_facade_2} is the continuation of the preceding listing~\ref{impl_facade_1} and contains two exemplary facade functions (lines~1--7) and the helper function \code{get-node} (lines~9--14). The two facade functions \code{get-files} and \code{get-functions} provide access to the XML element \code{TTCN3File} which represents a \ttcn{} source code file respectively to the element \code{FunctionDef} which represents a \ttcn{} function definition. Both functions feature a parameter \code{\$n} of the type \code{node()*} (zero or more items of the type node) and do not directly access the global module variable \code{\$xfacade:root} but instead pass the parameter \code{\$n} to the helper function \code{get-node} (lines~2 and~6) to access the underlying XML document. 

This strategy allows to control the scope of facade functions. If for instance \code{get-functions} is called and the parameter \code{\$n} is the empty sequence (\code{get-functions(())}), all function definitions are returned. On the other hand, if parameter \code{\$n} is for instance bound to a \code{TTCN3File} element, only the function definitions of this \ttcn{} file are returned. This behaviour is implemented in the helper method \code{get-node} which returns either the variable \code{\$xfacade:root} (line~11) if \code{\$n} is empty or otherwise \code{\$n} itself (line~13).

\begin{lstlisting}[style=XQuery, caption=Facade functions of the \ttcn{} facade module, label=impl_facade_2]
declare function xfacade:get-files($n as node()*) {
	xfacade:get-node($n)//TTCN3File
};

declare function xfacade:get-functions($n as node()*) {
	xfacade:get-node($n)//FunctionDef
};

declare function xfacade:get-node($n as node()*) {
	if (empty($n)) then
		$xfacade:root
	else
		$n
};
\end{lstlisting}

The complete facade module currently consists of about 70~facade functions and their source code is included as appendix~\ref{appendix_xquery_facade}. For a better overview, the function are structured in groups. The first group contains facade functions like \code{get-files}, \code{get-functions}, and \code{get-identifiers} which are common for most programming languages. The second group consists of functions specific to \ttcn{} such as \code{get-testcases} and \code{get-alt-constructs}. The facade functions are not complete in the way that all kinds of information of the underlying XML tree could be accessed through them. The currently implemented functions providing access to the most important parts of the \ttcn{} representation. But as seen above, the functions of the facade layer are quite simple and therefore missing ones could simply be added as the framework grows. 

The third group of facade functions is a special case and contains functions calling AST-related Java methods of TRex. This is realised using the Java bindings provided by Saxon which are described in section~\ref{impl_java_xquery}. The direct access to the AST using the TRex APIs is preferred over the access to the XML structure, as TRex already implements AST-related analysis methods like a scope-aware reference finder. Repeating these implementations using XQuery would have required additional effort without significant benefits. The use of the direct access to the AST is currently only used by two smell detection functions to discover single references. All other analysis functions are written using pure XQuery expressions.

The following details regarding the usage of Java methods of TRex are strictly related to the facade layer. Hence, they are presented here and not in section~\ref{impl_trex} which discusses the integration of XAF into TRex. The Java methods called by the facade module are located in the class \code{TRexAST} of the main XAF plug-in \brcode{de.ugoe.cs.swe.trex.xaf}. This main XAF plug-in requires the Saxon plug-in \brcode{de.ugoe.cs.swe.saxon} to make use of Saxon to execute XQuery expressions. Java calls from XQuery modules are executed by Saxon and therefore Saxon needs to find the according classes. Normally the Java classpath is used for this purpose, but the Eclipse plug-in concept differs from the classpath approach. To permit Saxon to find the class \code{TRexAST} in the plug-in \brcode{de.ugoe.cs.swe.trex.xaf} it is \emph{not} possible to let the Saxon plug-in depend on the XAF plug-in because this would lead to a plug-in dependency cycle. The solution is the \emph{Buddy Class Loading}\footnote{\url{http://wiki.eclipse.org/index.php/Context\_Class\_Loader\_Enhancements\#Buddy\_Class\_Loading}, \url{http://www.eclipsezone.com/articles/eclipse-vms/}} concept of Eclipse. For instance, this allows plug-in \code{A} to register at plug-in \code{B}. If plug-in \code{A} cannot find a class in its own scope, it looks for the class at all registered plug-ins, in this example plug-in \code{B}. This behaviour is controlled by entries in the \code{MANIFEST.MF} files. Hence, the file \code{MANIFEST.MF} of the Saxon plug-in is modified to permit other plug-ins to register themselves at the Saxon plug-in by adding the line \code{Eclipse-BuddyPolicy: registered}. The manifest file of the XAF plug-in is also modified to register itself at the Saxon plug-in by adding the line \brcode{Eclipse-RegisterBuddy: de.ugoe.cs.swe.saxon}. The Saxon plug-in is now able to find the \code{TRexAST} class in the XAF plug-in.

\paraskip

Figure~\ref{xaf_facade} summarises the tasks of the XQuery facade layer module and shows its central position in the framework. The facade layer is used by the smell detection module to access the XML representation of the TRex AST and additionally to directly access the TRex AST by utilising the TRex API.

\input{pstricks/ch5_facade}


\section{XQuery Pattern Description of Bad Smells} \label{impl_xquery_smells}

The description of patterns using XQuery is the actual core of XAF. As the framework is customised for the detection of bad smells, this section shows how XQuery is used to describe and detect bad smells in \ttcn{} source code. The section is structured as follows: the main focus is set to the detailed presentation of a few sample XQuery functions for the detection of bad smells. This is succeeded by XQuery implementation-related features, a list of all currently available XQuery smell detection functions, and the presentation of the XML file \brcode{smells.ttcn3.xml} describing the structure of the bad smell detection module.

Listing~\ref{impl_smell_1} shows the XQuery function to detect instances of the bad smell \emph{Long Parameter List} (see section~\ref{ttcn3smell} for details on this smell). The function \code{long-parameter-list} is parametrised with the \code{integer} parameter \code{\$floor}. The value of \code{\$floor} is the minimal number of parameters required to mark a parameter list as too long. \ttcn{} allows not only functions to have a parameter list but also templates, testcases, typedefs, external functions, signatures, and altsteps. Therefore the parameter lists of those constructs are also included in the analysis. For reasons of clarity the presented XQuery function only takes functions and templates into account. Hence, in line~4 all function and template definitions are joined into one sequence. Using a \code{for} loop, each item of the sequence is bound to the variable \code{\$parametrizable-construct} one after another in line~3. Its parameters are stored in the variable \code{\$parameters} in line~5, and the number of parameters is compared to the number of required parameters to match. This is done using the \code{where} condition in line~6 which checks if the number of parameters (\code{count(\$parameters)}) is greater or equal to the value of the variable \code{\$floor}. If this condition is true, an instance of the bad smell is found and returned in line~7.

\begin{lstlisting}[style=XQuery, caption=XQuery function to detect the bad smell \emph{Long Parameter List}, label=impl_smell_1]
declare function xsmell:long-parameter-list($floor as xs:integer)
{
	for $parametrizable-construct in
		(xfacade:get-functions(()), xfacade:get-templates(()))
	let $parameters := xfacade:get-parameters($parametrizable-construct)
	where count($parameters) >= $floor
	return $parametrizable-construct
};
\end{lstlisting}

Listing~\ref{impl_smell_2} shows the second sample XQuery function which is called \code{missing-log}. It describes the bad smell \emph{Missing Log} which is a \ttcn-specific smell that occurs if ``setverdict is used with verdict inconc or fail, but without calling log''~\cite{Bisanz06}. The function \code{missing-log} iterates over all \code{setverdict} statements (line~3), stores the according type of the verdict in the variable \code{\$verdict-type} (line~4), and the parent block of the \code{setverdict} statement in the variable \code{\$parent-block} (line~5). The \code{where} condition in lines~6--8 ensures that in the block of the \code{setverdict} statement no log operation is contained and that the verdict type is fail or inconc (lines~7--8). If both conditions are true (no log operation and verdict type fail or inconc) an instance of the bad smell \emph{Missing Log} is found and the verdict is returned in line~9.

\begin{lstlisting}[style=XQuery, caption=XQuery function to detect the bad smell \emph{Missing Log}, label=impl_smell_2]
declare function xsmell:missing-log()
{
	for $verdict in xfacade:get-setverdicts(())
	let $verdict-type := xfacade:get-verdict-type($verdict)
	let $parent-block := xfacade:get-parent-block($verdict)
	where not(xfacade:get-log-ops($parent-block)) 
	  and (xfacade:verdict-is-fail($verdict-type)
	   or xfacade:verdict-is-inconc($verdict-type))
	return $verdict
};
\end{lstlisting}

\label{smell_unreachable_default}

Listing~\ref{impl_smell_3} shows slightly simplified version of the smell detection function \code{unreachable-default}. It describes the \ttcn-related bad smell \emph{Unreachable Default} which occurs if an alt statement contains an else branch while a default is active. This functions makes use of the XQuery data model which keeps all nodes in the document order. This allows the function to detect if a default is activated when an else branch is executed. The function loops over all block which contain default declarations in line~3. In lines~6--8 the else branch and the activation/deactivation statements of the default are bound to corresponding variables. If the activate statement is followed by the else branch which is followed by the deactivation of the default, an instance of this bad smell is found. The according check is performed in line~10.

\begin{lstlisting}[style=XQuery, caption=XQuery function to detect the bad smell \emph{Unreachable Default}, label=impl_smell_3]
declare function xsmell:unreachable-default()
{
	for $block in xfacade:get-blocks(())[xfacade:get-defaults(.)]
	return
		let $alt  := xfacade:get-alt-constructs($block)
		let $else := (xfacade:get-else-statements($alt))[1]
		for $activate   in xfacade:get-activate-ops($block)
		for $deactivate in xfacade:get-deactivate-ops($block)
		(: else statement must be surrounded by activate and deactivate :)
		where $activate << $else and $else << $deactivate
		return $activate
};
\end{lstlisting}

XAF also implements an infrastructure for the clone detection of duplicated code. The actual comparison functions are located in a library module and their application is presented to detect instances of the bad smell \emph{Duplicate Alt Branches}. Listing~\ref{impl_smell_4} shows a slightly simplified version of the XQuery function describing this bad smell. The workflow for each duplicate search is related: first the scope of the comparison is estimated. In this case duplicate branches contained in altsteps and alt constructs must be compared. Hence, the for loop in line~3 interates over all altsteps and alt constructs and selects the alt branches of the current scope. The actual comparison is performed by the function \code{find-duplicates} in line~5 which returns a sequence of duplicates. As this sequence may contain multiple kinds of duplicates\footnote{Using a non-XML syntax to describe this: the sequence might look like (dup1, dup1, dup2, dup2, dup2). To ease the subsequent processing this sequence is transformed to look like ((dup1, dup1), (dup2, dup2, dup2)).} the duplicates belonging together are grouped by the function \code{group-duplicates} and returned in line~6.

\begin{lstlisting}[style=XQuery, caption={[XQuery function to detect \emph{Duplicate Alt Branches}]XQuery function to detect the bad smell \emph{Duplicate Alt Branches}}, label=impl_smell_4]
declare function xsmell:duplicate-alt-branches()
{
	for $scope in (xfacade:get-altsteps(()), xfacade:get-alt-constructs(()))
	let $to-compare := xfacade:get-alt-branches($scope)
	let $duplicates := xlib:find-duplicates($to-compare)
	return xlib:group-duplicates($duplicates)
};
\end{lstlisting}

The duplicate search can simply be adapted to find other types of duplicate code by changing the scope and the elements to compare in the lines~3 and~4. It is also possible to omit the scope completely and to compare elements in the complete XML document what allows to find duplicates that are located in different \ttcn{} source code files. But these kinds of duplicate searches are quite expensive regarding running time as each element must be compared to each other. 

The precise number of comparisons that are performed by the function \code{find-duplicates} is $\sum_{i=1}^{n} i + \ldots + (n-i) \Rightarrow \frac{n*(n-1)}{2}$ whereat \emph{n} is the number of elements to compare. Using the O-notation the maximal effort can be estimated by an upper boundary of $O(\frac{n^{2}}{2}) \Rightarrow O(n^{2})$~\cite{knuth}.

The implemented XQuery functions for the detection of duplicated code currently support two comparison modes. The default mode returns only strictly matching duplicates while the second mode returns those elements as duplicates whose XML subtrees contain the same elements. Text nodes and attributes are not taken into account for the comparison using this mode. As only identifiers and literals are stored as text nodes, this strategy allows to find duplicates with differing arguments. For example, the two alt branches shown in listing~\ref{ttcn3_dup_alt} are considered as duplicates---even though the arguments of the functions \code{receive} and \code{log} are different. The desired comparison behaviour is controlled via a parameter of the clone detection functions which is left out in listing~\ref{impl_smell_4}.

\begin{lstlisting}[style=TTCN3, caption={\ttcn{} alt branches considered as duplicates}, label=ttcn3_dup_alt]
[] InputPort.receive(ErrorFoo) {
	setverdict(fail);
	log("received error foo");
}
[] InputPort.receive(ErrorBar) {
	setverdict(fail);
	log("received error bar");
}
\end{lstlisting}

The presented sample XQuery smell detections functions clarify the advantages of using XQuery for the pattern description. Easy readability of the patterns is achieved by using descriptive variable names and avoiding deep nesting of function calls. The usage of library functions and facade functions instead of XML element and attribute names also improves the comprehensibility of the patterns.

\paraskip

The described XQuery functions return their matches as XML subtrees of the global XML document \code{ttcn3.xml}. As discussed in section~\ref{design_matches} only the information required for locating the matches in the analysis target need to be returned. Thus, for each analysis function a corresponding function is declared which returns the match formated according to the XML schema \code{matches.xsd}. Listing~\ref{impl_smell_match} shows a pair of two functions called \code{missing-log}. The first one is the analysis function, the second function formats the match as desired. In lines~1--4 the function of listing~\ref{impl_smell_2} is repeated but the function body is left out as it is not necessary for this context. In lines~6--9 the second function named \code{missing-log} is located. The only purpose of this function is to take the additional parameter \code{\$name} and call the function \code{format-match} with the result of the first \code{missing-log} function and the parameter \code{\$name}. The \code{format-match} function extracts all required information from the complete match(es) and returns a sequence of \code{match} elements conforming to the \code{matches.xsd} schema. The value of the parameter \code{\$name} should be set by the caller to the function name itself (for instance \code{xsmell:missing-log}) to include this value in the \code{match} element as \code{found-by} attribute. This allows the subsequent application to distinguish which match element was created by which function. Unfortunately, neither XQuery itself nor a Saxon extension supports reflection mechanisms which would have allowed to get the current function name from inside of XQuery.

\begin{lstlisting}[style=XQuery, caption={[Two XQuery functions per analysis]Two XQuery functions per analysis: One returning complete matches, one returning \code{match} elements conforming to the \code{matches.xsd} schema}, label=impl_smell_match]
declare function xsmell:missing-log()
{
	[...]
};

declare function xsmell:missing-log($name as xs:string)
{
	xfacade:format-match(xsmell:missing-log(), $name)
};
\end{lstlisting}

Currently 20~XQuery analysis functions for the detection of bad smells are implemented. All of them are described in the \ttcn{} code smell catalogue. The implemented analysis function cover all categories of the code smell catalogue but the category data flow anomalies. The following analysis functions are not \ttcn-specific as the bad smells can also occur in other programming languages (ordered by name): \brrcode{duplicate-code-in-conditionals}, \brrcode{goto}, \brrcode{long-parameter-list}, \brrcode{long-statement-block}, \brrcode{magic-number}, and \brrcode{nested-conditional}. The following smell detection functions are specific to \ttcn{} (ordered by name): \brrcode{deactivation-other-level}, \brrcode{duplicate-alt-branches}, \brrcode{fully-parameterized-template}, \brrcode{idle-ptc}, \brrcode{missing-activation}, \brrcode{missing-deactivation}, \brrcode{missing-log}, \brrcode{missing-verdict}, \brrcode{short-template} (based on the number of characters), \brrcode{short-template2} (based on the number of template fields), \brrcode{singular-component-element-reference}, \hspace{0.2mm} \brrcode{singular-template-reference}, \hspace{0.2mm} \brrcode{stop-in-function}, and \brrcode{unreachable-default}.

% the \hspace{0.2mm} hacks above are ugly but working fine!

For details on the bad smells and their implementation, see the comments in the XQuery module \brcode{smells.ttcn3.xquery}, the according XML file \brcode{smells.ttcn3.xml}, and the \ttcn{} code smell catalogue. Additionally, the source codes of all XQuery analysis functions are included as appendix~\ref{appendix_xquery_generic} (generic smell detection functions) and appendix~\ref{appendix_xquery_ttcn3} (smell detection functions for \ttcn).

\paraskip

As discussed in section~\ref{design_hierarchy} XQuery modules are unstructured lists of XQuery functions which is not suitable for a framework as the embedding application cannot easily make use of the modules. Hence, the structure of modules is described in according XML documents conforming to the XML schema definition \code{hierarchy.xsd}. Listing~\ref{impl_smell_hierarchy} contains a snippet of the file \code{smells.ttcn3.xml} which describes the hierarchical structure of the smell detection module \brcode{smells.ttcn3.xquery}. As the allowed XML elements and attributes of hierarchy XML documents are presented in detail in section~\ref{design_hierarchy}, listing~\ref{impl_smell_hierarchy} is only briefly described. Lines~1--6 contain the root element \code{hierarchy} and its attributes to find the according XQuery module file and to access it with correct namespace information. The remainder of the document (lines~7--15) consists of a \code{category} element containing two \code{function} elements. The second \code{function} element features a \code{parameter} element and according attributes.

\begin{lstlisting}[style=XML, caption={[Snippet of the file \code{smells.ttcn3.xml}]File \code{smells.ttcn3.xml} describing the structure of the smell detection module \brcode{smells.ttcn3.xquery}}, label=impl_smell_hierarchy]
<hierarchy xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:noNamespaceSchemaLocation="hierarchy.xsd"
	name="Bad Smell Detection for TTCN-3 Source Code"
	filename="smells.ttcn3.xquery"
	namespace-prefix="xsmell"
	namespace="de.ugoe.cs.swe.xaf.smells.ttcn3">
	<category name="Test Behavior">
		<function name="Missing Log" xquery-name="missing-log"/>
		<function name="Stop In Function" xquery-name="stop-in-function">
			<parameter 
				name="Exclude functions running on a component" 
				xquery-name="skip-runs-on" type="boolean"/>
		</function>
	</category>
</hierarchy>
\end{lstlisting}



\section{Integration into TRex} \label{impl_trex}

Before going into the details of the integration, an overview of the complete framework architecture including its embedding into TRex is given. Figure~\ref{xaf_overview} shows the global structure of the XQuery-based analysis framework customised for the detection of bad smells in \ttcn{} test suites and integrated into TRex. While figure~\ref{xaf_facade} is focused on the position of the facade layer, figure~\ref{xaf_overview} includes all relevant parts of the framework. Only details regarding the integration into the TPTP static analysis framework are left out. The three boxes in figure~\ref{xaf_overview} at the right bottom of the figure illustrate the core framework: the XQuery facade layer, XQuery smell detection module, and XQuery library modules. The latter contain common functions which are not related to software analysis and therefore no further described. The round box at the left bottom is the starting point for the usage of XAF and the interface where the framework is coupled with TPTP. The three boxes on the right top show, starting with the topmost box, the analysis target (\ttcn{} source code) and its representation formats (AST of the \ttcn{} source code and the XML representation of the AST). The box on the left side shows the direct access from the facade layer through TRex APIs to the AST passing by the XML representation format which is described in detail at the end of section~\ref{impl_facade}.

\input{pstricks/ch5_overview}

The integration of XAF into TRex is described in this section. First a narrow description of preparations regarding updating existing TRex plug-ins is given. Afterwards the actual integration of XAF into TRex using the static analysis framework of TPTP is discussed.

The Eclipse plug-ins \brcode{de.ugoe.cs.swe.trex.patterndetection.*} created by Bisanz~\cite{Bisanz06} in 2006 serve as a starting point for the integration of XAF into TRex as they are already built upon the TPTP static analysis framework. These pattern detection plug-ins shall be preserved and it should be possible to use them in conjunction with the newly created XAF plug-ins. Additionally, as much as possible TPTP-related functionality should be shared between the pattern detection and XAF plug-ins to avoid code duplication.

The fist step was an update of the existing pattern detection plug-ins to the current TPTP~4.4.0 branch which required mostly adoptions of TPTP interface changes. In the second step, functionality that should be shared between the XAF and pattern detection plug-ins was identified. As a result of this, the plug-in \brcode{de.ugoe.cs.swe.trex.tptp} was created which contains a common base class for analysis results and declarations of shared constants. Additionally, the plug-in \brcode{de.ugoe.cs.swe.trex.patterndetection.ui} no longer depends on the core plug-in of the pattern detection and can now serve as the user interface for the pattern detection and XAF plug-ins. These changes allow to combine both analysis plug-ins and to perform analyses including rules from both plug-ins at the same time.

\paraskip

The XAF integration into TRex is located in the plug-in \brcode{de.ugoe.cs.swe.trex.xaf} which contains all classes required for the usage of the TPTP static analysis framework. The most important classes are briefly described in table~\ref{xaf_classes}. The entry point to the static analysis framework is the XAF class \code{AnalysisProvider} which extends the class \code{AbstractAnalysisProvider} of TPTP. This relationship is reflected by the \code{plugin.xml} file shown in listing~\ref{xaf_plugin_xml}. The TPTP extension point in line~2 is extended by the XAF analysis provider in lines~3--7. A link to the \code{AnalysisProvider} class is provided in line~4, a unique identifier and a human readable label for the provider are stored in lines~5 and~6. In line~7 the link to user interface implementation of a viewer class is provided. As described above, XAF and the pattern detection plug-ins by Bisanz make use of the shared viewer class \brcode{de.ugoe.cs.swe.trex.patterndetection.ui.PatternDetectionViewer}.

\begin{lstlisting}[style=XML, caption=File \code{plugin.xml} of the XAF plug-in for TRex, label=xaf_plugin_xml]
<plugin>
	<extension point="org.eclipse.tptp.platform.analysis.core.analysisProvider">
		<analysisProvider 
			class="de.ugoe.cs.swe.trex.xaf.AnalysisProvider"
			id="de.ugoe.cs.swe.trex.xaf.provider" 
			label="XQuery-based Analysis Framework for TTCN-3"
			viewer="de.ugoe.cs.swe.trex.patterndetection.ui.PatternDetectionViewer"/>
	</extension>
</plugin>
\end{lstlisting}

\begin{table}
\centering
\label{xaf_classes}
	\begin{tabular}{lp{8.5cm}}
	\rowcolor{lightgray}
	\headcolumn{Class Name}		& \headcolumn{Functionality} \\
	\hline \noalign{\vspace{1mm}}
	\code{AnalysisProvider}		& Creates TPTP categories and rules and controls the execution of analysis runs. \\
	\code{AnalysisRule}		& Represents XQuery functions as TPTP rules. \\
	\code{Analysis}			& Represents analysis runs. \\
	\code{XafResults}			& Represents XML results of XAF as Java object. \\
	\hline
	\end{tabular}
\caption{Important classes of the \brcode{de.ugoe.cs.swe.trex.xaf} plug-in}
\end{table}

The \code{AnalysisProvider} class fulfils the following tasks: it is responsible for the creation of the TPTP categories and rules based on the contents of the file \brcode{smells.ttcn3.xml}. Additionally, the provider controls the workflow of analysis runs. The implementation of both tasks is described in the following.

The XQuery-based analysis framework describes the structure of XQuery analysis modules in XML files conforming to the \code{hierarchy.xsd} schema. This allows the structure of modules to be described independent of any third-party standards and dependencies. Hence, the class \code{AnalysisProvider} needs to parse the file \brcode{smells.ttcn3.xml} to create a TPTP category for each category defined in the XML file and to create a TPTP rule for each function defined in the XML file. This is done on-the-fly the first time the TPTP analysis dialogue is opened inside a TRex instance.

The TPTP static analysis framework is not designed for adding categories and rules at runtime. Normally they are statically declared as TPTP extensions in the file \code{plugin.xml}. To work around this limitation, the method \code{getOwnedElements} of the abstract provider class is overwritten and used to insert the categories and rules on-the-fly. The first time the \code{getOwnedElements} method is called, the file \brcode{smells.ttcn3.xml} is parsed and all TPTP categories and rules are created according to contents of the XML file. 

Without going into details, the process of creating TPTP categories and rules on-the-fly is described in this paragraph. First the file \brcode{smells.ttcn3.xml} is parsed, made available as DOM tree, and validated against its XML schema definition. Afterwards the DOM tree is traversed recursively and TPTP categories and rules are created. For each \code{category} element of the XML file an according \code{DefaultAnalysisCategory} object is instantiated. Each XML \code{function} element---which describes an XQuery analysis function---is mapped to an \code{AnalysisRule} object which is provided by XAF. The \code{AnalysisRule} object stores the function name of its corresponding XQuery to enable a reverse mapping from the analysis rule to the corresponding XQuery function. To complete the mapping of the XML file to the static analysis framework of TPTP, \code{parameter} elements of the XML file are mapped to TPTP \code{AnalysisParameter} objects.

\paraskip

The control of the workflow of an analysis run---the execution of multiple analysis rules---is also located in the \code{AnalysisProvider} class and consists of the following steps.

\begin{description}
\item[Step 1:] The scope of the analysis run is determined. All \ttcn{} source code files respectively projects or a subsets of these can be selected for an analysis run. The scope is set by the user through an interface provided by TPTP.
\item[Step 2:] The XML representation is created which reflects the abstract syntax trees of all files of the analysis scope.
\item[Step 3:] The hierarchy of the TPTP categories and rules is traversed to collect the information which rules are selected by the user and need therefore to be included in the analysis run. Based on these information an according XQuery expression is built.
\item[Step 4:] The XQuery expression created in the preceding step is executed, the resulting XML is parsed and stored in an \code{XafResults} object.
\item[Step 5:] Finally, the matches contained in the \code{XafResults} object are transformed into TPTP results to allow to display them using the TPTP user interface.
\end{description}

The starting point of each analysis run is the method \code{analyze} located in the class \code{AnalysisProvider} which is called by the TPTP static analysis framework. The implementation of the five steps is briefly described in the following. 

Step~1 is completely done by the TPTP static analysis framework: the method \code{getFilteredResources} returns a list of all required \ttcn{} \code{IFile} objects. This list reflects the scope of the analysis run. Step~2, the generation of the XML representation, is realised by the \code{createXML} method of the XAF \code{Analysis} class. It covers all required subordinate steps like parsing the \ttcn{} source code files and calling the \code{xmlSerialize} method of each \code{LocationAST} object. In the end an XML file named \code{ttcn3.xml} is created which is the input for the software analysis. 

Step~3---building the XQuery expression based on the selected TPTP rules---is quite complex. As motivation an example of the desired result is given in listing~\ref{xquery_sample_call}. It shows an XQuery expression calling two analysis functions from the module \brcode{smells.ttcn3.xquery}. The execution of this expression by the XQuery processor is the actual XQuery-based analysis. The first four lines of the expression import the required smell detection and library modules. This is followed by the calls of the analysis functions \code{missing-log} and \code{long-parameter-list} in lines~7 and~8. Both function calls are wrapped into a call to the XQuery helper function \code{finalize-query} (line~6) which ensures that the returned results conform to the XML schema \code{matches.xsd}.

\begin{lstlisting}[style=XQuery, caption=XQuery expression calling two analysis functions, label=xquery_sample_call]
import module namespace xsmell = 
	'de.ugoe.cs.swe.xaf.smells.ttcn3' at 'smells.ttcn3.xquery';
import module namespace xlib = 
	'de.ugoe.cs.swe.xaf.library' at 'library.xquery';

xlib:finalize-query((
	xsmell:missing-log('xsmell:missing-log'),
	xsmell:long-parameter-list(4, 'xsmell:long-parameter-list')
))
\end{lstlisting}

To build such an XQuery expression it must first be determined which TPTP rules are selected by the user and should be included in the expression. This is done by the helper method \code{traverseCategories} which recursively traverses the hierarchy of all TPTP categories/rules and calls the method \code{generateXQueryFunctionCall} for each object \code{AnalysisRule} selected by the user. This generates a list of function calls---like the list of two calls in lines~7 and~8 in listing~\ref{xquery_sample_call}---which is stored in the class \code{Analysis}.

The mapping of TPTP rule parameter values to according XQuery parameter values is also required. The first parameter of the function call \code{long-parameter-list} in line~8 has the value \code{4}. This value originates from the TPTP analysis dialogue in which the user entered the value. Hence, these parameter values need to be mapped from Java to XQuery values. For instance, a checkbox of the TPTP dialogue is represented by the Java type \code{boolean}. If the value of such a Java \code{boolean} object is \code{true} it is mapped to the XQuery type \code{xs:boolean} and value \code{true()}. Parameter mapping is currently supported from the Java types \code{boolean}, \code{String}, and \code{Integer} to the according XQuery types.

Step~4 of each analysis run is the actual execution of the XQuery expression created in the preceding step. The analysis provider thus calls the method \code{executeXQuery} of the \code{Analysis} class. This method adds all required namespace import statements as shown in lines~1--4 of listing~\ref{xquery_sample_call} and also adds the call the XQuery function \code{finalize-query} (line~6). Additionally, it sets the value of the external XQuery variable \code{\$xfacade:path} (listing~\ref{impl_facade_1}, line~3) of the facade layer module. The content of this variable is the absolute path to the XML input file \code{ttcn3.xml} generated in step~2. This allows the facade layer module to access and read the XML file and to provide access to this representation format through its functions. Finally, the XQuery expression is executed using the XQuery API for Java (XQJ) provided by Saxon (see sections~\ref{impl_software} and~\ref{impl_java_xquery} for details on both). The resulting XML which conforms to the \code{matches.xsd} schema is parsed and transformed to a Java object of the class \code{XafResults} which provides simplified access to all matches and their attributes.

In the final step~5, the matches contained in the \code{XafResults} object are mapped to TPTP-based \code{AnalysisResult} objects. This is required to make use of the user interface provided by TPTP to present the results to the user. This again requires recursively traversing the hierarchy of the TPTP categories/rules using the helper method \code{traverseCategories} and calling the method \code{generateResult} for each rule. The reason for these traversals lays in the way XAF makes use of the static analysis framework. Normally each rule object features an \code{analyze} method which is called once for each analysis run, performs the analysis, and immediately returns its results. For example, this workflow is used by the pattern detection plug-ins by Bisanz~\cite{Bisanz06}. XAF could have adopted this, but for reasons of performance the described process of five steps was chosen. Otherwise each \code{AnalysisRule} object would have executed its own XQuery. This would have required parsing the XML input file for each analysis function. This would have slowed down the analysis obviously and needlessly---especially for large XML input files.

\paraskip

While figure~\ref{xaf_overview} is organised XAF-centric, the following figure~\ref{impl_arch_trex} is TRex-centric and shows integration of XAF into TRex. On the top, the different TRex plug-ins are listed like the XAF for \ttcn{} plug-in and the Java-based pattern detection plug-in. Both make use of the static analysis framework of TPTP which in turn is based on Eclipse. On the right side, some of the TRex core functionalities are listed. On the left side, the XQuery-based analysis framework itself and the XQuery processor Saxon are displayed.

\begin{figure}[h]
\label{impl_arch_trex}
\centering
\includegraphics[width=0.93\textwidth]{images/architecture.eps}
\caption[TRex overall architecture]{TRex overall architecture including the XQuery-based analysis framework}
\end{figure}

The following Eclipse plug-ins are modified respectively created during the implementation of XAF and the integration into TRex.

\begin{itemize}
\item The plug-in \code{de.ugoe.cs.swe.trex.patterndetection.ui} is modified to act as user interface component for the the pattern detection plug-in and the XAF TRex plug-in at the same time.
\item The plug-in \code{de.ugoe.cs.swe.trex.patterndetection.tests} is modified to be extensible and act as basis for the XAF tests. See section~\ref{impl_tests} for details on this.
\item The plug-in \code{de.ugoe.cs.swe.trex.tptp} is introduced to share TPTP-related code between the pattern detection plug-in and the XAF plug-in using TPTP.
\item The plug-in \code{de.ugoe.cs.swe.xaf} is created to contain the XAF core of XQuery modules and related XML files. 
\item The plug-in \code{de.ugoe.cs.swe.trex.xaf} makes use of the preceding plug-in and is created to separate the XAF core from its integration into TRex. The TPTP-related classes described in this section~\ref{impl_trex} are contained in this plug-in.
\item The plug-in \code{de.ugoe.cs.swe.trex.xaf.ui} contains the implementation of user-defined queries for TRex which are based on XAF. The implementation is described in detail in the following section~\ref{impl_udq}.
\item The plug-in \code{de.ugoe.cs.swe.saxon} wraps the Saxon packages into an Eclipse plug-in to be able to make use of Saxon from within of Eclipse.
\end{itemize}


\section{User-Defined Queries} \label{impl_udq}

\abbrev{SWT}{Standard Widget Toolkit}
\abbrev{UI}{User Interface}

This section covers the implementation of XAF user-defined queries for TRex. Extending XAF itself with new analysis functions is possible by adding new XQuery modules or by adding additional functions to existing XQuery modules. As this is not very comfortable for end-users, a user interface (UI) for creating and executing user-definition queries is provided. The plug-in \brcode{de.ugoe.cs.swe.trex.xaf.ui} contains the implementation which allows TRex users to enter and execute custom XQuery expression and to search for patterns in \ttcn{} source code using the XAF infrastructure. The search dialogue itself is located in the search menu of TRex and can alternatively be opened using the keyboard shortcut \code{Ctrl+H}. For displaying results of user-defined queries, a result view is implemented which displays matches grouped per file. Additionally, it is possible to store user-defined queries and to include these queries as TPTP rules in analysis runs.

As most of the Java code for the implementation of user-defined queries is related to the user interface, it is separated into the UI plug-in \brcode{de.ugoe.cs.swe.trex.xaf.ui}. Functionalities like executing XQuery expressions and parsing results are already implemented in the plug-in \brcode{de.ugoe.cs.swe.trex.xaf} and are reused by the UI plug-in. Sharing features is also the reason to store results of XQuery expressions in \code{XafResults} objects instead of directly mapping the XQuery results to TPTP result objects: \code{XafResults} objects are also used by the implementation of user-defined queries.

The search dialogue and the result view are implemented by extending the Eclipse extension point \brcode{org.eclipse.search.searchPages} with the class \code{SearchPage} and the extension point \brcode{org.eclipse.search.searchResultViewPages} with the class \code{SearchResultPage}. The \code{SearchPage} class is mainly responsible for filling the search dialogue with \emph{Standard Widget Toolkit} (SWT) controls like a text box to enter the XQuery expression and other controls like a button to store the current query. 

The button to execute the search is already predefined by Eclipse. Clicking it invokes the \code{run} method of the XAF class \code{SearchQuery} which first calls the method \code{createXML} of the \code{Analysis} class to generate the \brcode{ttnc3.xml} input file. Afterwards the user-defined XQuery is executed using the method \code{executeXQuery} which is also located in the \code{Analysis} class and the result is displayed in the result view. If no matches are found or the XQuery expression contains syntax errors, the user is informed using a pop-up message box including an appropriate advice.

Displaying and handling the results of user-defined queries is taken over by the XAF classes \code{SearchResult} and \code{SearchResultPage}. First, the returned \code{XafResults} object is transformed into \code{FileMatches} objects (one per file) as the result view displays the results grouped by files. The \code{FileMatches} objects in turn are encapsulated in \code{Match} objects which are provided by Eclipse and required to make use of the given interfaces of the \code{SearchResult} classes. Finally, the class \code{SearchResultPage} is responsible to deliver the matches in the desired format to predefined interfaces to allow the user to navigate through a tree of matches, to click on each match, and to open a TRex editor window with the selected match.

If the user stores an XQuery expression, the query and its name are stored using the XAF class \code{PersistentPreferences}. It makes use of the Eclipse class \code{PreferenceStore} which allows to persistently store arbitrary data. The serialisation of the according object is stored in the root of the TRex workspace directory as file \brcode{xaf-user-defined-queries.store}. It is a plain text file containing key-value pairs for each user-defined query respectively its name. Beside the purpose of storing queries to enrich the set of predefined analysis functions, additionally the last executed user-defined query is stored in this file. The query is loaded from the store when the search dialogue is opened again. This allows to refine queries step-by-step without starting with an empty search dialogue every time.

Figure~\ref{impl_trex_udq} shows a screenshot of the TRex user interface for the user-defined queries. On the right side, the search dialogue is shown. On the left side, the result view is shown at the bottom of the screenshot. On the upper left side, a \ttcn{} file is opened in a TRex editor window and the found matches are highlighted using Eclipse search markers.

\begin{figure}[h]
\label{impl_trex_udq}
\centering
\includegraphics[width=1.00\textwidth]{images/trex-udq.eps}
\caption{TRex user-defined query: search dialogue, results, and markers}
\end{figure}

To be able to run analyses including a mixture of predefined and user-defined queries, it is required to map user-defined queries from the store to TPTP rules. This is done analogous to the run-time creation of predefined queries in the method \code{getOwnedElements} of the class \code{AnalysisProvider} presented in section~\ref{impl_trex}. The method \code{addUserDefinedCategoryAndRules} creates a category for all user-defined queries and iterates over the queries in the store. For each query an \code{AnalysisRule} object is created and added to the category. At the same time, an XQuery module containing all user-defined queries is created on-the-fly and stored as file \brcode{user.ttcn3.trex.xquery} with the other XAF modules. This allows analysis runs to call functions from both modules---the module containing user-defined queries and the smell detection module.

\paraskip

User-defined queries themselves can access the facade layer module and also the smell detection module. This allows to create new smell detection functions---either based on existing ones or completely new ones. The implementation of user-defined queries is in closer contact to TRex and \ttcn{} compared to the abstracted smell detection module. Hence, it is possible to give up the policy that queries should be independent of their representation formats and analysis targets. Bypassing the facade layer permits user-defined queries to directly interact with the XML representation of the AST what affords writing compact and highly specialised queries.

In the following a few sample user-defined queries are presented. Listing~\ref{impl_udq_1} shows a query which checks function names for their conformance to the \ttcn{} Naming Conventions~\cite{t3naming} proposed by the ETSI. It is recommended that function names should start with \code{f\_} and therefore the query in listing~\ref{impl_udq_1} discovers those functions whose name does not start accordingly.

\begin{lstlisting}[style=XQuery, caption=User-defined query using facade functions, label=impl_udq_1]
for $function in xfacade:get-functions(())
let $name := xfacade:get-identifier-up($function)
where not(starts-with($name, "f_"))
return $name
\end{lstlisting}

The second user-defined query in listing~\ref{impl_udq_2} fulfils the same task as the first one, but while the first query in listing~\ref{impl_udq_1} consequently makes use of the facade layer functions to access the XML document, the query in listing~\ref{impl_udq_2} directly accesses the XML document. Direct access to the XML document permits to write queries which are much shorter than the once making use of the facade layer. Additionally, it is possible to access parts of the XML document for which no corresponding facade functions are available. (See appendix~\ref{appendix_xquery_facade} for all currently available facade functions.) The drawback of such queries is evident: they completely depend on the structure of the XML document and hence on \ttcn{}. Additionally, their readability is worse than those making use of the facade functions.

\begin{lstlisting}[style=XQuery, caption=User-defined query directly accessing the XML document, label=impl_udq_2]
$xfacade:root//(FunctionDef//Identifier)[1][not(starts-with(., 'f_'))]
\end{lstlisting}

% (: Finds hidden timer declarations with the same name :)
% for $t in $xfacade:root//TimerInstance
% let $b := xfacade:get-parent-block($t)
% for $current in $b//TimerInstance
% where not($current is $t)
%   and $current//Identifier = $t//Identifier
% return $t

% (: Find hidden variable declarations (with same name and type) :)
% for $v1 in xfacade:get-vars(())
% for $v2 in xfacade:get-single-vars($v1)
% let $b := xfacade:get-parent-block($v1)
% for $t1 in xfacade:get-vars($b)
% for $t2 in xfacade:get-single-vars($t1)
% where not($v2 is $t2)
%   and $v2 << $t2
%   and $v2 = $t2
%   and xfacade:get-type($v1) = xfacade:get-type($t1)
% return ($v2, $t2)

% TODO: is this pagebreak still ok here?
\pagebreak

``Querying source code interactively [\ldots] is a critical task''~\cite{paul94sca} and a base technology regarding reverse engineering of software~\cite{kullbach1999qet}. Hence, the workflow of creating efficiently new user-defined queries for \ttcn{} source code using TRex is presented in the following.

The internal representation of the \ttcn{} source code as abstract syntax trees is based on the BNF grammar definition provided by the \ttcn{} specification~\cite{t3standard1}. For example, the naming and nesting of the nonterminal symbols of the \ttcn{} grammar is reflected by the abstract syntax tree. The \emph{AST View} provided by TRex allows to visualise the abstract syntax tree of the currently displayed file. Marking places in the source code, highlights the according passage in the AST. Combining this TRex view and the information about the \ttcn{} grammar in the specification, helps to improve the understanding of the structure of the \ttcn{} core language and its tree representation.

The XML document which is used as data structure for the user-defined queries, is nearly a one-to-one mapping of the abstract syntax tree. This eases the creation of new queries as the names in the AST view and those of the XML elements and hence those used for the XQuery path expressions are all the same. The first step in the workflow towards a new user-defined query is that the user needs to know what to query for---in terms of source code. Marking the source code of interest and looking at the AST view, reveals those elements of the syntax tree which represent this currently marked piece of source code. As just mentioned, these names are also used by the XML document and can therefore be used to formulate user-defined queries.

For instance, marking the source code \code{myTimer.start(5);} would let the user know that the corresponding XML element is called \code{StartTimerStatement} and that the query \code{xfacade:get-root()//StartTimerStatement} finds all timers on which the \code{start} function is called. Some further investigations later, the user might be able to write a user-defined function which detects all started timers that are never stopped as shown in listing~\ref{impl_udq_23}.

\begin{lstlisting}[style=XQuery, caption=User-defined query to find never stopped timers, label=impl_udq_23]
for $start in xfacade:get-root()//StartTimerStatement
let $block := xfacade:get-parent-block($start)
where not($block//StopTCStatement)
return $start
\end{lstlisting}

Still some improvements regarding user-defined queries might be reasonable. A tighter interaction between the AST view and the search dialogue of the user-defined queries is imaginable. For instance, each element of the AST view could provide a context menu with the option to start a query based on this node. This could open the search dialogue with an XQuery path expression based on the element that was clicked in the AST view. Another desireable improvement would be a live search: while typing an XQuery expression, the matched source code would be highlight on-the-fly in the currently displayed file. Both possible improvements would permit an even faster and efficient creation of user-defined queries.


\section{Tests for the Implementation} \label{impl_tests}

\abbrev{PDE}{Plug-In Development Environment}

The integration of XAF into TRex is tested using JUnit~3.8~\cite{JUnit}. Even though testing Eclipse plug-ins using JUnit is similar to the standard workflow of JUnit, the testcases must be executed as \emph{JUnit Plug-In Test} (also called \emph{Plug-In Development Environment} (PDE) JUnit Test). The main difference is that instead of ``using the standard JUnit class \code{TestRunner}, PDE JUnit tests are executed by a special test runner that launches another Eclipse instance [\ldots] and executes the test methods within that workbench.''\footnote{\url{http://wiki.eclipse.org/index.php/FAQ\_What\_is\_a\_PDE\_JUnit\_test\%3F}}

The JUnit testsuite for the XAF plug-ins is located in the plug-in \brcode{de.ugoe.cs.swe.trex.xaf.tests} and based on the testsuite for the pattern detection plug-ins~\cite[Chap.~5.5]{Bisanz06}. This is done as both projects use the UI of the TPTP static analysis framework to interact with the user and therefore the pattern detection tests are already adapted to TPTP. To make use of the existing test infrastructure, the plug-in \brcode{de.ugoe.cs.swe.trex.patterndetection.tests} was changed to be extensible and the XAF test plug-in was created on top of that.

For each testcase the following files must be provided: one or more \ttcn{} source code files, a launch configuration file which selects the TPTP rules to include in the analysis run, and an according Java \code{TestCase} subclass which controls the test execution. Every testcase makes use of the class \code{PatternDetectionTestProject} to copy the source code and launch configuration files to the expected locations, parse the source code files, create the abstract syntax trees, and launch the TPTP analysis run. Finally, it is checked that the desired matches are found by the analysis.

This testing workflow covers the following parts of TRex respectively XAF:
\begin{description}
\item[Step 1:] Starting TRex, creating a workspace and a \ttcn{} project, importing \ttcn{} source code files, and creating abstract syntax trees.
\item[Step 2:] Dynamic creation of the TPTP categories and rules of XAF based on the XML file \brcode{smells.ttcn3.xml}.
\item[Step 3:] Transformation of the abstract syntax trees to the XML representation of the AST.
\item[Step 4:] Execution of TPTP analysis runs and displaying the analysis results. This includes the following step:
	\begin{itemize}
	\item Generation of XQuery expressions based on the TPTP rule selections and execution of the XQuery expressions.
	\item Parsing of the matches returned from XQuery and transformation of the matches to TPTP results.
	\end{itemize}
\end{description}

Four JUnit test cases for different XQuery analysis functions are currently implemented. They cover different kinds of XQuery functions: one that calls Java methods of TRex from XQuery, one that returns matches in different \ttcn{} files, and two with multiple matches. Test cases for the user interface of the user-defined queries are currently not provided.


\section{Discussion} \label{impl_discussion}

Before going into the discussion of the implemented XQuery-based analysis framework, it is checked if the requirements for the framework as formulated in chapter~\ref{ch_req_and_design} are met. The general requirements R\ref{R1}--R\ref{R10} are already covered by the design and architecture of the framework as discussed in section~\ref{design_discussion}. The requirements T\ref{T1}--T\ref{T4} are specific to \ttcn{} and mainly demand a customisation of the framework towards the detection of bad smells in \ttcn{} source code and the integration into TRex. These requirements are fulfilled as presented in the preceding sections~\ref{impl_ttcn3_xml}--\ref{impl_trex} by converting \ttcn{} source code into XML, implementing the facade layer for \ttcn{} using XQuery, describing and detecting bad smells of the \ttcn{} code smell catalogue using XQuery, and integrating XAF into TRex and the static analysis framework of TPTP. The subordinate requirements U\ref{U1}--U\ref{U5} of requirement R\ref{R10} demand the possibility to add user-defined pattern and mainly concern the user interface of TRex. The according implementation is described in section~\ref{impl_udq} and fulfils all but the optional requirement U\ref{U5} which requests support to view, edit, delete, and rename stored user-defined queries. 

This leads directly to the limitations of the current implementation regarding the integration of XAF into TRex and the implementation of user-defined queries. No limitation concerns the XQuery core of XAF. Most of the following items are due to the limitations of the TPTP static analysis framework respectively the way XAF makes use of it.

\begin{itemize}
\item TPTP measures the overall time of an analysis run as sum of the times each rule takes. As all XQuery analysis functions are executed in one step, only the overall time the execution takes can be presented by TPTP instead of the time information for each rule.
\item Normally, each TPTP rule features a severity property. For dynamically added rules, like those of XAF, this severity property is not available.
\item Displaying duplicates using TPTP is not very well arranged as all duplicates of one rule appear in a flat list of matches. It is desireable to extend TPTP to allow displaying matches belonging together in separate groups.
\item TPTP allows to link so-called quick fixes to TPTP rules. These quick fixes suggest refactorings to remove the bad smell and make therefore use of the refactoring capabilities of TRex. Currently, XAF does not implement any quick fixes.
\item There is no possibility to persistently skip false positives or undesired matches for future analysis runs. This could be implemented for instance using source code comments containing special instructions to skip a match or using a database which reminds undesired matches by their position in the source code.
\end{itemize}

\noindent Regarding user-defined queries, the following limitations are known:

\begin{itemize}
\item As already mentioned, currently no user interface to view, edit, delete, and rename stored queries is implemented (requirement U\ref{U5}).
\item Unlike TPTP analysis runs, the search dialogue of the user-defined queries does not support scope-aware searches. All \ttcn{} files of the currently opened TRex projects are included in the search. As soon as a query is stored and executed as TPTP rule it uses the scope of TPTP.
\item The text box of the search dialogue where user-defined queries are entered, features no undo capability or other desireable features like syntax highlighting for XQuery.
\end{itemize}

Removing these limitations would make the framework more user-friendly and powerful, but the currently implemented functions already provide a smooth integration of the predefined XQuery functions. Additionally, user-defined queries provide a powerful way to extend the framework with new smell detection functions and other patterns. Regarding the application and performance of the framework it is referred to section~\ref{ch5_let_it_run}.


\chapter{Application and Extension} \label{ch_5}

The current chapter presents results of the application of the XQuery-based analysis framework for the detection of bad smells in existing, commercially used \ttcn{} test suites (section~\ref{ch5_let_it_run}). Additionally, this chapter 
reveals possibilities for the extension of the framework towards new analysis targets (section~\ref{new_targets}) and towards new types of patterns (section~\ref{new_patterns}) beside this thesis' focus on \ttcn{} and bad smells.

\section{Detection of Bad Smells in TTCN-3 Test Suites} \label{ch5_let_it_run}

In this section, the results of the bad smell detection in existing \ttcn{} test suites using XAF are discussed. This is done mainly for two reasons: first to check the implementation of XAF for correctness and usability. The second reason is to reveal the need for automated bad smell detection even for commercially used \ttcn{} test suites.

The following \ttcn{} test suites\footnote{\url{http://www.ttcn-3.org/PublicTTCN3TestSuites.htm}} are publicly available and used to perform the tests: Session Initiation Protocol (SIP)~\cite{etsi-sip}, Internet Protocol version~6 (IPv6)~\cite{etsi-ipv6}, and Digital Mobile Radio (DMR)~\cite{etsi-dmr}. All tests are performed using the following 32-bit software versions: Debian GNU/Linux version~4.0, Sun Java~6 SE Runtime Environment (1.6.0\_03-b05), Eclipse SDK~3.3.1 (M20070921-1145), Eclipse TPTP~4.4.0.3, \mbox{Saxon-B} 8.9.0.4, XAF~1.0.0, and the currently latest available version of TRex\footnote{Subversion branch \code{trunk}, timestamp 2007-10-17}. The test system was equipped with 2~GiB of RAM and an AMD Athlon~64 single-core processor with 2~GHz.

The results of the test runs are presented in table~\ref{analysis_results}. The first column contains the names of the analysis functions and their parameter settings. The other three columns contain the number of smell instances found for each function. Additionally, the time consumption for the complete analysis run is given in the last row. The overall time consumption includes the following steps: creation of the XML representation of the abstract syntax tree (the time required for building the AST is not included), execution of the XQuery expressions (this includes reading the XML file \code{ttcn3.xml}), and displaying the matches in the TPTP view inclusive Eclipse markers. Also the time the execution of the XQuery expressions takes (without creating the XML file and displaying the results) is given.

\begin{table}
	\centering
		\begin{center}
		\begin{tabular}{|l|rrr|}
			\hline
			Test Suite & SIP & IPv6 & DMR \\
			Version & v4.1.1 & v1.1.1 & v1.2.1 \\
			Lines of Code & 55510 & 24462 & 15485 \\
			\hline
			Avoid Goto Statement & -- & -- & -- \\
			Magic Numbers (not 0,0.0,1,1.0) & 427 & 331 & 109 \\
			Long Statement Block (120+ lines) & 58 & 4 & -- \\
			Nested Conditional & 178 & 58 & 58 \\
			Short Template (2 or less fields) & 62 & 39 & 88 \\
			Short Template (50 or less chars) & 31 & 21 & 65 \\
			Deactivation On Another Level & -- & -- & -- \\
			Missing Activation & -- & -- & -- \\
			Missing Deactivation & 686 & 367 & 326 \\
			Unreachable Default & -- & -- & -- \\
			Duplicate Alt Branches & 5 & 26 & 48 \\			
 			Duplicate Code in Conditionals & 192 & 111 & 90 \\
			Fully Parameterized Template & -- & -- & -- \\
			Long Parameter List (6+ parameters) & 108 & 8 & 2 \\
			Missing Log & 692 & 13 & 389 \\
			Missing Verdict (in alt branches) & 3 & -- & -- \\
			Stop In Function & 313 & 5 & 5 \\
			Idle PTC & -- & -- & 7 \\
			\hline
			Duration seconds (XQuery/total) & 142/170 & 34/43 & 30/36 \\
			\hline
		\end{tabular}
		\end{center}
	\caption{Bad smells found by XAF in \ttcn{} test suites}
	\label{analysis_results}
\end{table}

First of all, the duration of the analysis runs shows that the execution of the XQuery expressions took between a half minute for the DMR test suite and about two and a half minutes for the SIP test suite. As the latter consists of about 55000 lines of code while the DMR suite consists of about 15000 lines of code, it can be assessed that the framework scales well. Additional to the time the XQuery executions take, about 20\% extra time is consumed by the TRex for the creation of the input XML file and the parsing and displaying of the results. The performance can be improved by using a further optimised XQuery processor like Saxon-SA instead of Saxon-B (see section~\ref{impl_software} for details on this).

The detection results can be parted in two groups. The first one contains those analysis functions which found no smell instances at all. The reasons for this result can be various: the smell detection functions might be written in a way to miss existing smells or there are no instances of these smells in the investigated test suites. For some smell detection functions like \emph{Avoid Goto Statement} it can be assessed that they work as intended and miss no smell instances. 
Another possibility is that the functions are working correct and the smell descriptions which originate from an academic background are not existing in real-world test suites.

The second group contains those smell detection functions that returned matches. The bad smell \emph{Missing Deactivation} seem to be very common. But as defaults are automatically deactivated at the end of their lifetimes, the consequences of this smell are not drastic. The current implementation of the detection function to find instances of the smell \emph{Missing Log} might return false positive matches. This is due to the fact that log operations called in subordinate functions are currently not taken into account. The majority of the functions like \emph{Magic Numbers}, \emph{Long Statement Block}, and \emph{Duplicate Alt Branches} finds smell instances without false positives and negatives and indicate a lot of refactoring opportunities.

Only the 18~pure XQuery analysis functions of the currently 20 implemented ones are included in the test runs discussed above. The two functions calling Java methods of the TRex reference finder, are left out due to the fact that the TRex implementation of the reference finder is slow and would have adulterated the performance results. To show how large the effect of the invocations of these TRex methods is, two additional analysis runs are performed using the Java-based pattern detection of Bisanz~\cite{Bisanz06} with the DMR test suite~\cite{etsi-dmr}: the first run consists of all eleven implemented rules of the Java-based pattern detection and took 476 seconds. The second run includes only those seven rules not using the reference finder of TRex and took only 43 seconds. A similar performance loss can be observed with XAF when the reference finder of TRex is invoked from within XQuery functions. To show that the invocation of Java methods from within XQuery is not the bottleneck---but the referencing counting itself is slow---the Java-only implementation of Bisanz is chosen for this two test runs.

To compare the performance of TRex' Java-based pattern detection and XAF, four smell detection functions which are implemented by both approaches (\emph{Magic Number}, \emph{Short Template}, \emph{Duplicate Alt Branches}, and \emph{Fully Parameterized Template}) are used for analysis runs with the DMR test suite~\cite{etsi-dmr}. The analysis run with the Java-based approach takes 2.4~seconds while the XAF run takes 22.3~seconds. The reasons for this obvious difference are various: while the Java-based approach starts analysing the AST which was created before, XAF first needs to convert the AST to XML (what takes about three seconds for the DMR test suite) and parse the XML document before starting the actual analysis. Additional test runs using an XQuery debugger\footnote{\url{http://www.oxygenxml.com/xquery_debugger.html}} revealed that most of the time is spent by the duplicate search and highlighted some potential optimisation opportunities. Altogether, the performance loss seams to be the price that needs to be paid for a declarative approach.


\section{Extension of the Framework to New Analysis Targets} \label{new_targets}

To prove the extensibility of the XQuery-based analysis framework, two new analysis targets and according representation formats are integrated as experimental implementations. As additional analysis targets UML models (created with ArgoUML~\cite{ArgoUML}) and Java source code are used. They are represented using XMI (see section~\ref{sw_rep_xmi}, also created using ArgoUML) respectively srcML (see section~\ref{sw_rep_xml2}). The XQuery analysis function \brrcode{long-parameter-list} (see section~\ref{impl_xquery_smells}) is applied to \ttcn{}, UML, and Java to detect instances of the bad smell \emph{Long Parameter List} in these different analysis targets. Therefore according facade layer modules for XMI and srcML are implemented which only consist of those facade functions required for the detection of this bad smell. All required new facade functions are shown in listing~\ref{new_facade_uml} (UML/XMI) respectively listing~\ref{new_facade_java} (Java/srcML). Compared to those for \ttcn{} (see appendix~\ref{appendix_xquery_facade}) only the XML element names had to be changed to the according names of the XMI respectively srcML format.

\begin{lstlisting}[style=XQuery, caption=Facade functions for UML in XMI/ArgoUML flavour, label=new_facade_uml]
declare namespace UML = "org.omg.xmi.namespace.UML";
declare function xfacade:get-functions($n as node()*) {
	xfacade:get-node($n)//UML:Operation
};
declare function xfacade:get-parameters($n as node()*) {
	xfacade:get-node($n)//UML:BehavioralFeature.parameter/UML:Parameter
};
\end{lstlisting}

\begin{lstlisting}[style=XQuery, caption=Facade functions for Java in srcML flavour, label=new_facade_java]
declare function xfacade:get-functions($n as node()*) {
	xfacade:get-node($n)//function
};
declare function xfacade:get-parameters($n as node()*) {
	xfacade:get-node($n)//parameter_list/param
};
\end{lstlisting}

The positive results of this prototypical integration of UML models and Java source code demonstrates that the usage of XML as representation format allows a smooth extension of the framework towards new analysis targets. The application of the facade layer provides a level of abstraction that allows to write analysis functions to be independent of the underlying analysis target and representation format. This allows to implement analysis functions once and make use of them to discover pattern in different software artefacts.

\paraskip

The analysis targets presented up to now reflect static structures of source code respectively software architectures. The core framework is also adoptable for the analysis of software behaviour. Given an XML representation of software behaviour, it is possible to adopt the framework to this scenario. Potential XML formats capable of representing software behaviour are introduced in section~\ref{xml_graph}.

Patterns regarding software behaviour are different from those that can be found in static software artefacts. While switching from one representation format of static software artefacts to another requires relatively small changes, the initial creation of a facade layer for the access to a behavioural representation format requires a higher effort. This is due to the fact that the underlying representation formats differ significantly from those representing static software artefacts. Still the process is the same as for the presented facade layer for \ttcn: identify information required by the analysis layer and provide abstract access to these information using XQuery facade functions.


\section{Extension of the Framework to New Types of Patterns} \label{new_patterns}

The framework is designed to be extensible regarding new analysis target and also regarding new types of patterns. This section covers the possibility to make use of XAF for the integration of new pattern types. It provides a concrete example how the framework could be used for the calculation of source code metrics. Additionally, a possibility is discussed to adopt the framework for the recognition of test purposes in traces of test case executions.

As discussed in section~\ref{find_metrics}, source code metrics which exceed their boundary value, can indicate instances of bad smells and can therefore be used for the smell detection. But the original purpose of source code metrics is software measurement which is a stand-alone type of pattern. The XQuery-based analysis framework can be extended to the calculation of source code metrics as listing~\ref{xquery_loc} shows. The provided example calculates the simple metric \emph{Lines of Code} which counts all non-empty and non-comment lines of a source code file. Using XQuery and the facade layer of \ttcn{} is is done by iterating over all files (line~1) and returning for each file an XML element \code{lines-of-code} (line~2). Each of these elements contains the according filename as attribute (line~3) and the number of lines of code as content. The calculation of the lines of code is based on the location information contained in the XML document: the line information where each entity of the source code starts and ends are joined in one sequence (line~5) and the total number of the distinct values equals the lines of code.

\begin{lstlisting}[style=XQuery, caption=XQuery implementation of the metric \emph{Lines of Code}, label=xquery_loc]
for $file in xfacade:get-files(())
return element lines-of-code {
	xfacade:get-filename($file),
	count(distinct-values(
		(xfacade:get-lines($file), xfacade:get-line-ends($file))
	))
}
\end{lstlisting}

As the implementation indicates, XQuery can be used to express arbitrary metrics. More metrics based on XQuery, like \emph{Number of Methods} and \emph{Lack of Cohesion in Methods}, are discussed in~\cite{Eichberg07}.

\paraskip
\abbrev{TPLan}{Testing Purpose Language}

One desireable goal regarding testing is to examine if the implemented tests fulfil their purposes. Test purposes are prose descriptions of testing objectives regarding conformance testing.\footnote{\url{http://portal.etsi.org/mbs/Testing/conformance/conformance.htm\#TP}} They can be formulated using natural language, but also using more formal, structured approaches. Test purposes can be seen as patterns in the test behaviour. The goal is to retarget test purposes in the according behaviour representations of test cases. This is only possible, if the purposes are specified using a formal or semi-formal approach like the \emph{Testing Purpose Language} (TPLan)~\cite{DBLP:conf/pts/SchulzWR07}. This increases the possibility to recover the purposes from the test cases. ``TPLan has been designed to make test purpose specification more formal without inhibiting the expressive power of prose''~\cite{DBLP:conf/pts/SchulzWR07}.

Listing~\ref{tplan} shows the basic structure of TPLan test purpose specifications. Each test purpose needs an unique identifier (line~1) and a \code{with} condition (line~2) which contains specific initial condition required for the test purpose to be valid. The \code{ensure that} part (lines~3--6) contains the actual critical verdict criteria for a test in form of a stimulus and response to ensure that the requirements are met~\cite{DBLP:conf/pts/SchulzWR07}. The parts written between ``\code{<}''~and~``\code{>}'' need to be filled with predefined or user-defined entities, events, values, units, conditions, and words describing the actual purpose. The degree of prose used for this is up to the author of the test purposes.

\begin{lstlisting}[style=none,caption=Basic structure of TPLan test purpose specification,label=tplan]
TP id: <string>
with { <pre-conditions> }
ensure that {
  when { <stimulus> }
  then { <response> }
}
\end{lstlisting}

The optional requirement T\ref{T5} demands possibilities for the detection of test purposes in TTCN-3 source code or in behavioural representations of it. As currently no tool is available which allows to extract behavioural representations of \ttcn{} test suite executions (for instance traces of the execution), this requirement could not be investigated in depth. Even though it is currently not implemented, it should be possible to make use of the software analysis framework to perform analyses for the detection of test purposes, given the following prerequisites: an XML representation reflecting the behaviour of \ttcn{} test suite executions and the availability of an accordingly adopted facade layer module as described in section~\ref{new_patterns}. Additionally, the test purposes must be specified in formal way, for instance using TPLan. This would allow a transformation of the purposes into XQuery expressions and to check if they can be found in the traces of the test case execution.


\chapter{Summary and Outlook} \label{ch_summary}

This thesis presented a generic XML-based approach for software analysis and an accordingly implemented framework. The most important goal is the possibility to describe software patterns like bad smells using the declarative XML query language XQuery. Another major achievement is the independence of the patterns regarding the underlying programming languages respectively software artefacts. This combination allows to describe patterns in a generic way and on a high level of abstraction. Additionally, it enhances the readability and reusability of the patterns.

The software analysis framework is adopted towards the testing language \ttcn{} and the detection of bad smells in \ttcn{} test suites. Currently 20~functions for the detection of bad smells are implemented and the framework is integrated in the \ttcn{} tool TRex. Additionally to the smell detection, user-defined queries for TRex are implemented which permit to interactively query \ttcn{} source code. The results of the smell detection in existing test suites reveal the need for automated smell detection and highlight the usability of the framework. 

The prototypical adoption of the framework towards new types of patterns and the integration of new analysis targets gives an outlook towards additional fields of application. The presented solution is mainly focused on structural issues like the detection of bad smells and can be advanced towards related patterns like bug patterns, design defects, and design patterns. Additional research can also be done into the direction of data and control flow anomalies and the analysis of XML-based traces of dynamic software behaviour.


\clearpage
\addcontentsline{toc}{chapter}{\nomname}
\printnomenclature


\clearpage
% Adds the bibliography to the TOC
\addcontentsline{toc}{chapter}{\refname}
% Load the bibliography from file bibliography.bib
\bibliography{bibliography}
\bibliographystyle{alpha}
% Adds not quoted items manually to the bibliography
% \nocite{*}


\vspace{6mm} \noindent All URIs mentioned in this thesis have been verified on the 2007-10-24. Copies of the pages can be requested by sending an email to \code{jens@noedler.de}.


% \listoffigures
% \addcontentsline{toc}{chapter}{List of Figures}


% \renewcommand{\lstlistlistingname}{List of Listings}
% \lstlistoflistings
% \addcontentsline{toc}{chapter}{\lstlistlistingname}
% If a caption is too long, make use of the shorttext: caption={[<shorttext>]<longtext>}


% http://www-h.eng.cam.ac.uk/help/tpl/textprocessing/teTeX/latex/latex2e-html/ltx-200.html
\appendix

% more roooooom for the source code
\areaset[22mm] % additional margin left
	{13.7cm} % width
	{21.2cm} % heigth

\chapter{XQuery Source Codes} \label{appendix_xquery}

\section{XQuery Facade Functions} \label{appendix_xquery_facade}

\begin{lstlisting}[style=XQueryNoFloat, caption=, label=facade_all]
(:::::::::::::::::::::::: generic facade functions :::::::::::::::::::::::::::)

declare function xfacade:get-root()
{
	$xfacade:root
};

declare function xfacade:get-files($n as node()*)
{
	xfacade:get-node($n)//TTCN3File
};

declare function xfacade:get-modules($n as node()*)
{
	xfacade:get-node($n)//TTCN3Module
};

declare function xfacade:get-blocks($n as node()*)
{
	xfacade:get-node($n)//StatementBlock
};

declare function xfacade:get-parent-block($n as node()*)
{
	($n/ancestor::StatementBlock)[last()]
};

declare function xfacade:get-functions($n as node()*)
{
	xfacade:get-node($n)//FunctionDef
};

declare function xfacade:get-unions($n) 
{
	xfacade:get-node($n)//UnionDef
};

declare function xfacade:get-typedefs($n) 
{
	xfacade:get-node($n)//TypeDef
};

declare function xfacade:get-constdefs($n) 
{
	xfacade:get-node($n)//ConstDef
};

declare function xfacade:is-const-value($n) 
{
	$n/ancestor::ConstDef
};

declare function xfacade:get-parameters($n as node()*)
{
	xfacade:get-node($n)//FormalValuePar
};

declare function xfacade:get-statements($n as node()*)
{
	xfacade:get-node($n)//FunctionStatementOrDef
};

declare function xfacade:get-if-constructs($n as node()*)
{
	xfacade:get-node($n)//ConditionalConstruct
};

declare function xfacade:get-if-branch($n as element(ConditionalConstruct)*)
{
	$n/StatementBlock
};

declare function xfacade:get-else-branch($n as element(ConditionalConstruct)*)
{
	$n/ElseClause/StatementBlock
};

declare function xfacade:get-elseif-branch($n as element(ConditionalConstruct)*)
{
	$n/ElseIfClause/StatementBlock
};

declare function xfacade:get-gotos($n as node()*)
{
	xfacade:get-node($n)//GotoStatement
};

(: gets the next identifier that is located on the descendant axis. hence, the
   search for the identifier in the tree is towards the leaves. that's up! :)
declare function xfacade:get-identifier-up($n as node()*)
{
	(xfacade:get-node($n)//Identifier)[1]
};

(: gets the next identifier that is located on the preceding axis. hence, the
   search for the identifier in the tree is towards the root. that's down! :)
declare function xfacade:get-identifier-down($n as node()*)
{
	(xfacade:get-node($n)/preceding::Identifier)[last()]
};




declare function xfacade:get-identifiers($n as node()*)
{
	xfacade:get-node($n)//Identifier
};

declare function xfacade:get-repeat-ops($n as node()*)
{
	xfacade:get-node($n)//RepeatStatement
};

declare function xfacade:get-log-ops($n as node()*)
{
	xfacade:get-node($n)//LogStatement
};

declare function xfacade:get-integer-values($n as node()*)
{
	xfacade:get-node($n)//IntegerValue
};

declare function xfacade:get-float-values($n as node()*)
{
	xfacade:get-node($n)//FloatValue
};

declare function xfacade:get-variable-references($n as node()*)
{
	xfacade:get-node($n)//VariableRef
};

declare function xfacade:get-filename($n as node()*)
{
	(: try to get the attribute form the TTCN3File element :)
	let $filename := $n/ancestor::TTCN3File/@filename
	return
		if (exists($filename)) then
			$filename
		else
			(: if the attr was not found, search all elements for the attribute.
			   this is not done in the first place for reasons of speed. :)
			($n//@filename)[1]
};

declare function xfacade:get-line($n as node()*)
{
	$n/@line
};

declare function xfacade:get-lines($n as node()*)
{
	$n//@line
};

declare function xfacade:get-line-end($n as node()*)
{
	$n/@line-end
};

declare function xfacade:get-line-ends($n as node()*)
{
	$n//@line-end
};

declare function xfacade:get-offset($n as node()*)
{
	$n/@offset
};

declare function xfacade:get-offset-end($n as node()*)
{
	$n/@offset-end
};


(:::::::::::::::::::: TTCN-3-specific facade functions :::::::::::::::::::::::)

declare function xfacade:get-testcases($n) 
{
	xfacade:get-node($n)//TestcaseDef
};

declare function xfacade:get-external-functions($n) 
{
	xfacade:get-node($n)//ExtFunctionDef
};

declare function xfacade:get-components($n) 
{
	xfacade:get-node($n)//ComponentDef
};

declare function xfacade:get-signatures($n) 
{
	xfacade:get-node($n)//SignatureDef
};

declare function xfacade:get-altsteps($n) 
{
	xfacade:get-node($n)//AltstepDef
};

declare function xfacade:get-else-statements($n as node()*)
{
	xfacade:get-node($n)//ElseStatement
};

declare function xfacade:get-defaults($n as node()*)
{
	xfacade:get-node($n)[*//PredefinedType[. = "default"]]
};

declare function xfacade:get-activate-ops($n as node()*)
{
	xfacade:get-node($n)//ActivateOp
};

declare function xfacade:get-deactivate-ops($n as node()*)
{
	xfacade:get-node($n)//DeactivateStatement
};

declare function xfacade:get-activate-by-name($n, $name)
{
	xfacade:get-identifier-down(xfacade:get-activate-ops($n))[. = $name]
};

declare function xfacade:get-deactivate-by-name($n, $name)
{
	xfacade:get-node($n)//DeactivateStatement[VariableRef/Identifier = $name]
};

declare function xfacade:get-alt-constructs($n as node()*)
{
	xfacade:get-node($n)//AltConstruct
};

declare function xfacade:get-alt-branches($n as node()*)
{
	xfacade:get-node($n)//GuardStatement
};

declare function xfacade:get-templates($n as node()*)
{
	xfacade:get-node($n)//TemplateDef
};

declare function xfacade:is-template-field-value($n) 
{
	$n/ancestor::TemplateDef
};

declare function xfacade:get-template-inner-bodies($n as node()*)
{
	xfacade:get-node($n)/TemplateBody//TemplateBody
};

declare function xfacade:get-template-body($n as node()*)
{
	(xfacade:get-node($n)//TemplateBody)[1]
};

declare function xfacade:get-template-fields($n as node()*)
{
	xfacade:get-node($n)//FieldSpec
};

declare function xfacade:get-template-identifier($n as element(TemplateDef))
{
	$n/BaseTemplate/Identifier
};

declare function xfacade:get-identifieres-of-deactivate-ops-same-level(
	$n as node()*)
{
	$n/following-sibling::FunctionStatementOrDef/
	FunctionStatement/BehaviourStatements/DeactivateStatement//Identifier
};

declare function xfacade:get-setverdicts($n as node()*)
{
	xfacade:get-node($n)//SetLocalVerdict
};

declare function xfacade:get-verdict-type($n as node()*)
{
	xfacade:get-node($n)//VerdictTypeValue
};


declare function xfacade:verdict-is-fail($n as element(VerdictTypeValue)*)
{
	$n = "Fail"
};

declare function xfacade:verdict-is-inconc($n as element(VerdictTypeValue)*)
{
	$n = "Inconc"
};

declare function xfacade:get-runs-on($n as node()*)
{
	xfacade:get-node($n)//RunsOnSpec
};

declare function xfacade:get-timers($n as node()*)
{
	xfacade:get-node($n)//TimerInstance
};

(: returns complete variable declarations like "var integer i, j;" :)
declare function xfacade:get-vars($n as node()*)
{
	xfacade:get-node($n)//VarInstance
};

(: returns single variables like "i" of a declarations "var integer i, j;" :)
declare function xfacade:get-single-vars($n as node()*)
{
	xfacade:get-node($n)//SingleVarInstance
};

declare function xfacade:get-type($n as node()*)
{
	xfacade:get-node($n)//Type
};

declare function xfacade:get-create-ops($n as node()*)
{
	xfacade:get-node($n)//CreateOp
};

declare function xfacade:get-start-ops($n as node()*)
{
	xfacade:get-node($n)//ConfigurationStatements[. = "StartTCStatement"]
};

declare function xfacade:get-stop-ops($n as node()*)
{
	xfacade:get-node($n)//ConfigurationStatements[. = "StopTCStatement"]
};

declare function xfacade:is-start-op($n) 
{
	$n/ancestor::StartTCStatement
};
\end{lstlisting}


\pagebreak
\section{Generic Smell Detection Functions} \label{appendix_xquery_generic}

\begin{lstlisting}[style=XQueryNoFloat, caption=, label=smell_functions_1]
declare function xsmell:duplicate-code-in-conditionals(
	$compare-mode as xs:integer) as node()*
{
	for $scope in xfacade:get-blocks(())
	let $to-compare := (
		let $if := xfacade:get-if-constructs($scope)
		return (
			xfacade:get-if-branch($if) | 
			xfacade:get-elseif-branch($if) |
			xfacade:get-else-branch($if)
		)
	)

	let $duplicates := xlib:find-duplicates($to-compare, $compare-mode)
	let $duplicates := xfacade:add-filename-attributes($duplicates)
	return xlib:group-duplicates($duplicates, $compare-mode)
};


declare function xsmell:nested-conditional() as node()*
{
	for $if-construct in xfacade:get-if-constructs(())
	let $if-branch   := xfacade:get-if-branch($if-construct)
	let $else-branch := xfacade:get-else-branch($if-construct)
	(: count of the statements contained in the if/else branches :)
	let $if-count    := count(xfacade:get-statements($if-branch))
	let $else-count  := count(xfacade:get-statements($else-branch))
	(: only proceed if the if construct contains an else branch :)
	where $else-branch
	return
		(: skip if constructs where both branches contain only one statement :)
		if ($if-count = 1 and $else-count = 1) then
			()
		(: return those branches containing one statement :)
		else if ($if-count = 1) then
			$if-branch
		else if ($else-count = 1) then
			$else-branch
		else
			()
};


declare function xsmell:goto() as node()*
{
	xfacade:get-gotos(())
};


declare function xsmell:long-statement-block($floor as xs:integer) as node()*
{
	(: check not only statement blocks of functions, test cases or altsteps
	   as the smell catalog demands, but any statement block :)
	for $block in xfacade:get-blocks(())
	let $block-length := xfacade:get-line-end($block) - xfacade:get-line($block)
	where $block-length + 1 >= $floor
	return $block
};


declare function xsmell:magic-number($ignore as item()*) as node()*
{
	for $value in (xfacade:get-integer-values(()), xfacade:get-float-values(()))
	(: skip constants and template field values :)
	where not(xfacade:is-const-value($value))
	  and not(xfacade:is-template-field-value($value))
	  (: skip values found in the ignore list :)
	  and not(functx:is-value-in-sequence($value, $ignore))
	return $value
};


declare function xsmell:long-parameter-list($floor as xs:integer) as node()*
{
	(: not only functions can be parameterized but also many other constructs :)
	for $parametrizable-construct in
		(
			xfacade:get-functions(()), 
			xfacade:get-templates(()),
			xfacade:get-testcases(()),
			xfacade:get-typedefs(()),
			xfacade:get-external-functions(()),
			xfacade:get-signatures(()),
			xfacade:get-altsteps(())
		)
	let $parameters := xfacade:get-parameters($parametrizable-construct)
	let $number-of-parameters := count($parameters)
	where $number-of-parameters >= $floor
	return ($parametrizable-construct, xlib:return-value($number-of-parameters))
};
\end{lstlisting}


\section{Smell Detection Functions for TTCN-3} \label{appendix_xquery_ttcn3}

\begin{lstlisting}[style=XQueryNoFloat, caption=, label=smell_functions_2]
declare function xsmell:duplicate-alt-branches($compare-mode as xs:integer)
  as node()*
{
	for $scope in (xfacade:get-altsteps(()), xfacade:get-alt-constructs(()))
	let $to-compare := xfacade:get-alt-branches($scope)
	let $duplicates := xlib:find-duplicates($to-compare, $compare-mode)
	let $duplicates := xfacade:add-filename-attributes($duplicates)

	return xlib:group-duplicates($duplicates, $compare-mode)
};


declare function xsmell:unreachable-default() as node()*
{
	for $block in xfacade:get-blocks(())[xfacade:get-defaults(.)]
	return
		let $alt  := xfacade:get-alt-constructs($block)
		let $else := (xfacade:get-else-statements($alt))[1]
		for $activate   in xfacade:get-activate-ops($block)
		for $deactivate in xfacade:get-deactivate-ops($block)
		(: else statement must be surrounded by activate and deactivate :)
		where $activate << $else and $else << $deactivate
		(: identifieres of the activate and deactivate must be the same :)
			and xfacade:get-identifier-down($activate) eq 
				xfacade:get-identifier-up($deactivate)
		return ($activate, xlib:caused-by($else))
};
declare function xsmell:fully-parameterized-template() as node()*
{
	for $template in xfacade:get-templates(())
	let $defined := data(xfacade:get-identifiers(
		xfacade:get-parameters($template)))
	let $used := data(xfacade:get-identifiers(
		xfacade:get-template-inner-bodies($template)))
	where count($defined) > 0
	  and count($defined) = count(xfacade:get-template-fields($template))
	  and deep-equal($defined, $used)
	return $template
};


declare function xsmell:missing-deactivation() as node()*
{
	for $block in xfacade:get-blocks(())
	let $activate := xfacade:get-activate-ops($block)
	where not(xfacade:get-deactivate-ops($block))
	return $activate
};


declare function xsmell:missing-activation() as node()*
{
	for $block in xfacade:get-blocks(())
	let $deactivate := xfacade:get-deactivate-ops($block)
	where not(xfacade:get-activate-ops($block))
	return $deactivate
};


declare function xsmell:deactivation-other-level() as node()*
{
	for $block in xfacade:get-blocks(())
	for $statement in xfacade:get-statements($block)
	for $default in xfacade:get-defaults($statement)
	where $default
	return
		let $default-name := xfacade:get-identifier-up($default)
		let $activate := xfacade:get-activate-by-name($block, $default-name)
		let $deactivate := xfacade:get-deactivate-by-name($block, $default-name)
		let $deactivate-names := 
			xfacade:get-identifieres-of-deactivate-ops-same-level($default)
		(: the default must be:
		   - activated and deactivated
		   - deactivated at another level as its the declaration   :)
		where $activate 
		  and $deactivate
		  and not($deactivate-names = $default-name)
		return $deactivate
};


declare function xsmell:missing-verdict($skip-testcases as xs:boolean)
  as node()*
{
	for $testcase in xfacade:get-testcases(())
	return
		(: return test cases without any setverdict call :)
		if (not(xfacade:get-setverdicts($testcase))) then
			if ($skip-testcases) then
				()
			else
				$testcase
		else
			(: return alt guards without any setverdict call :) 
			for $altguard in xfacade:get-alt-branches($testcase)
			where not(xfacade:get-setverdicts($altguard))
			(: skip alt guards which call repeat() as no verdict is required :) 
			  and not(xfacade:get-repeat-ops($altguard))
			return $altguard
};


declare function xsmell:missing-log() as node()*
{
	for $verdict in xfacade:get-setverdicts(())
	let $verdict-type := xfacade:get-verdict-type($verdict)
	let $parent-block := xfacade:get-parent-block($verdict)
	where not(xfacade:get-log-ops($parent-block)) 
	  and (xfacade:verdict-is-fail($verdict-type)
	   or xfacade:verdict-is-inconc($verdict-type))
	return $verdict
};


declare function xsmell:stop-in-function($skip-runs-on as xs:boolean)
  as node()*
{
	for $func in xfacade:get-functions(())
	let $stop := xfacade:get-stop-ops($func)
	where $stop
	return
		if ($skip-runs-on and xfacade:get-runs-on($func)) then
			()
		else
			$stop
};


declare function xsmell:short-template($floor as xs:integer)
  as node()*
{
	for $template in xfacade:get-templates(())
	let $template-body := xfacade:get-template-body($template)
	let $number-of-chars := 
		xfacade:get-offset-end($template-body) - xfacade:get-offset($template-body)
	where $number-of-chars <= $floor
	return $template
};


declare function xsmell:short-template2($floor as xs:integer)
  as node()*
{
	for $template in xfacade:get-templates(())
	where count(xfacade:get-template-fields($template)) <= $floor
	return $template
};


declare function xsmell:idle-ptc() as node()*
{
	(: check for each variable declaration if it's of the type component :)
	for $var  in xfacade:get-vars(())
	for $comp in xfacade:get-components(())
	where xfacade:get-identifier-up($comp) = 
	      xfacade:get-identifier-up(xfacade:get-type($var))
	return
		(: get component's variable name, surrounding block, and references :)
		let $var-name := xfacade:get-identifier-up(xfacade:get-single-vars($var))
		let $block    := xfacade:get-parent-block($var)
		let $references := 
			xfacade:get-variable-references($block)
			[xfacade:get-identifier-up(.) = $var-name]
		return
			(: no references of this component found? that smells! :)
			if (empty($references)) then
				$var
			(: report an idle ptc, if the create operation is called 
			   but the start operation is never called  :)
			else if (
				xfacade:get-create-ops($block)
				[xfacade:get-identifier-down(.) = $var-name]
			and
				(every $r in $references satisfies not(xfacade:is-start-op($r)))) 
			then
				$var
			else
				()
};


declare function xsmell:singular-template-reference() as node()*
{
	for $template in xfacade:get-templates(())
	let $template-references := xfacade:get-template-references($template)
	where $template-references = 1
	return $template
};


declare function xsmell:singular-component-element-reference() as node()*
{
	for $component in xfacade:get-components(())
	where xfacade:get-component-references($component) > 1
	(: check each timer, var, and const if it referenced only once :)
	return ( 
		for $timer in xfacade:get-timers($component)
		where xfacade:get-timer-references($timer) = 1
		return $timer,
		for $var in xfacade:get-single-vars($component)
		where xfacade:get-var-references($var) = 1
		return $var,
		for $const in xfacade:get-constdefs($component)
		where xfacade:get-const-references($const) = 1
		return $const
	)
};
\end{lstlisting}


% \chapter{Contents of the CD-ROM} \label{appendix_cd}
% 
% \noindent The CD-ROM which is attached to the print version of this thesis contains:
% 
% \begin{itemize}
% \item a digital PDF version of this thesis and
% \item the source codes of TRex and the XQuery-based Analysis Framework (XAF) taken from the Subversion repository (branch \code{trunk}) of TRex which is located at \url{http://www.trex.informatik.uni-goettingen.de/svn/trex/}.
% \end{itemize}



\end{document}


