Anomaly Detection Engine for Linux Logs (ADE)

XML for a Version 2 INTERVAL request

XML Results from an Analyze Request - details of each interval

The following code illustrates the XML structure generated by analyze for each analysis interval for each Linux system in the model group from the collection of logs processed by analyze. The major element is the interval element, which contains information about a specific analysis interval for a specific system. The interval element also contains one interval_message element for each unique message issued during the interval. If the same message ID was issued more than once during the selected interval, the XML contains only one interval_message element for that unique message ID.

<!--
  XML Schema for the Version 2 INTERVAL output: one "interval" document per
  analysis interval per system. It holds the interval-level scores and model
  details, followed by one interval_message entry for each unique message ID
  issued during the interval.

  The target namespace, the ade_version element and the limited_model
  attribute are aligned with the sample output shown at the end of this
  document, so that the sample validates against this schema.
-->
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
           xmlns="http://www.openmainframe.org/ade/AdeCoreIntervalV2"
           targetNamespace="http://www.openmainframe.org/ade/AdeCoreIntervalV2"
           elementFormDefault="qualified">

  <!-- Root element: one analysis interval for one system. -->
  <xs:element name="interval">
    <xs:complexType>
      <xs:sequence>
        <!-- Version of the ADE application programming interface. -->
        <xs:element name="version" type="xs:int" />
        <!-- Name of the system the interval belongs to. -->
        <xs:element name="sys_id" type="xs:string" />
        <!-- Interval boundaries, as xs:dateTime values. -->
        <xs:element name="start_time" type="xs:dateTime" />
        <xs:element name="end_time" type="xs:dateTime" />
        <!-- Anomaly score of the whole interval. -->
        <xs:element name="anomaly_score" type="xs:double" />
        <!-- Identifier ADE uses for the system model. -->
        <xs:element name="model_internal_id" type="xs:int" />
        <!--
          Version of the analytics engine. The sample output emits this
          element as ade_version; it was previously documented here as
          melody_version.
        -->
        <xs:element name="ade_version" type="xs:int" />
        <!-- Offset from UTC as a string, for example GMT+00:00. -->
        <xs:element name="gmt_offset" type="xs:string" />

        <!-- Details of the model used to analyze the interval. -->
        <xs:element name="model_info">
          <xs:complexType>
            <xs:attribute name="model_creation_date" type="xs:dateTime" use="required" />
            <xs:attribute name="training_period" type="xs:int" use="required" />
            <xs:attribute name="interval_size_in_sec" type="xs:long" use="required" />
            <xs:attribute name="analysis_group" type="xs:string" use="required" />
            <!--
              Present in the sample output with the value "Yes". Kept optional
              and typed as a string because "Yes" is not a valid xs:boolean
              lexical form. NOTE(review): confirm the full set of values.
            -->
            <xs:attribute name="limited_model" type="xs:string" use="optional" />
          </xs:complexType>
        </xs:element>

        <!-- Summary counters for the messages in the interval. -->
        <xs:element name="msg_summary">
          <xs:complexType>
            <xs:attribute name="num_new_msg" type="xs:int" use="required" />
          </xs:complexType>
        </xs:element>

        <!-- Zero or more entries, one per unique message ID. -->
        <xs:element name="interval_message" type="interval_message_type"
                    minOccurs="0" maxOccurs="unbounded" />
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- Per-message analysis results; msg_id identifies the message. -->
  <xs:complexType name="interval_message_type">
    <xs:sequence>
      <!-- Number of times the message was issued in the interval. -->
      <xs:element name="num_instances" type="xs:int" />

      <!-- Double value with a required "frequency" attribute. -->
      <xs:element name="bernoulli">
        <xs:complexType>
          <xs:simpleContent>
            <xs:extension base="xs:double">
              <xs:attribute name="frequency" type="xs:double" use="required" />
            </xs:extension>
          </xs:simpleContent>
        </xs:complexType>
      </xs:element>

      <!-- Cluster the message belongs to; -1 when it is in no recognized cluster. -->
      <xs:element name="cluster_id" type="xs:int" />

      <!-- Periodicity status, with optional last-issued time and score. -->
      <xs:element name="periodicity">
        <xs:complexType>
          <xs:attribute name="status" use="required">
            <xs:simpleType>
              <xs:restriction base="xs:string">
                <xs:enumeration value="IN_SYNC" />
                <xs:enumeration value="NOT_IN_SYNC" />
                <xs:enumeration value="NOT_PERIODIC" />
                <xs:enumeration value="NEW" />
              </xs:restriction>
            </xs:simpleType>
          </xs:attribute>
          <xs:attribute name="last_issued" type="xs:dateTime" use="optional" />
          <xs:attribute name="score" type="xs:double" use="optional" />
        </xs:complexType>
      </xs:element>

      <!-- Double value with a required "mean" attribute. -->
      <xs:element name="poisson">
        <xs:complexType>
          <xs:simpleContent>
            <xs:extension base="xs:double">
              <xs:attribute name="mean" type="xs:double" use="required" />
            </xs:extension>
          </xs:simpleContent>
        </xs:complexType>
      </xs:element>

      <!-- Contribution of the message to the interval anomaly score, raw and normalized. -->
      <xs:element name="intCont" type="xs:double" />
      <xs:element name="normIntCont" type="xs:double" />
      <!-- Anomaly score of this message within the interval. -->
      <xs:element name="anomaly" type="xs:double" />
      <!-- Cluster context of the message, for example OUT_OF_CONTEXT in the sample output. -->
      <xs:element name="cluster_status" type="xs:string" />
      <xs:element name="critical_words" type="xs:double" />
      <!-- Summary text and first-occurrence sample text of the message. -->
      <xs:element name="text_sum" type="xs:string" />
      <xs:element name="text_smp" type="xs:string" />
      <!-- Occurrence vector for the message. -->
      <xs:element name="time_vec" type="interval_time_vector_type" />
      <!-- Optional list of rules in effect for the message. -->
      <xs:element name="active_rules" type="active_rules_type" minOccurs="0" maxOccurs="1" />
    </xs:sequence>
    <xs:attribute name="msg_id" type="xs:string" use="required" />
  </xs:complexType>

  <!-- List of occ entries, one for each time the message was issued. -->
  <xs:complexType name="interval_time_vector_type">
    <xs:sequence>
      <xs:element name="occ" type="xs:int" minOccurs="0" maxOccurs="unbounded" />
    </xs:sequence>
  </xs:complexType>

  <!-- Wrapper for zero or more rule entries. -->
  <xs:complexType name="active_rules_type">
    <xs:sequence>
      <xs:element name="rule" type="rule_type" minOccurs="0" maxOccurs="unbounded" />
    </xs:sequence>
  </xs:complexType>

  <!-- A single rule: its name (with an affected_score flag) and its action. -->
  <xs:complexType name="rule_type">
    <xs:sequence>
      <xs:element name="name">
        <xs:complexType>
          <xs:simpleContent>
            <xs:extension base="xs:string">
              <xs:attribute name="affected_score" type="xs:boolean" />
            </xs:extension>
          </xs:simpleContent>
        </xs:complexType>
      </xs:element>
      <xs:element name="action" type="xs:string" />
    </xs:sequence>
  </xs:complexType>
</xs:schema>

XML element descriptions for INTERVAL output

The following list describes the major elements in the interval element.
version
An integer that identifies the version of the ADE application programming interface (API).
sys_id
A string that provides the name of the system that was specified on the INTERVAL request, and the name of the system group to which the system belongs.
start_time
Indicates the beginning of the first interval for which data is available for the specified system on the date in the INTERVAL request. The start time is indicated in the XML dateTime data type format in Coordinated Universal Time (UTC).
YYYY-MM-DDThh:mm:ss.tttZ
end_time
Indicates the beginning of the first interval after the date specified in the INTERVAL request. The end time is indicated in the XML dateTime data type format in Coordinated Universal Time (UTC).
YYYY-MM-DDThh:mm:ss.tttZ
anomaly_score
A double value that provides the anomaly score for this interval. The interval anomaly score is the percentile of the sum of each anomaly score for individual message IDs within an interval. When ADE uses priming data and current data to create a model of system behavior, a process that is called "training", ADE captures the distribution of interval anomaly scores for all intervals that are represented in the training data. ADE uses the distribution results to establish the range of values for each percentile.
The possible interval anomaly scores are:
0 through 99.4
The analysis interval contains messages and message clusters that match or exhibit relatively insignificant differences in expected behavior, as defined in the ADE  model. A score of 0 is possible because the server eliminates all expected, in-context messages from its scoring calculation. A score of 0 indicates intervals that exhibit no difference in behavior compared to the system or group model. The analysis snapshots for these analysis intervals are colored with the lightest blue shade.

Analysis intervals with scores that are greater than 0 but less than 99.5 contain some messages that are unexpected or issued out of context. Scores in this range indicate intervals that do not vary significantly from the system model. The analysis snapshots for these analysis intervals are colored with varying shades of blue.

99.5
Analysis intervals with this score contain some rarely seen, unexpected, or out-of-context messages. Generally speaking, this score indicates analysis intervals that have some differences from the system or group model but do not contain messages of much diagnostic value. The analysis snapshots for these analysis intervals are colored with the darkest blue shade.
99.6 - 100
Analysis intervals with this score contain rarely seen messages (these messages appear in the model only once or twice), or many messages that are unexpected or issued out of context. This score indicates analysis intervals with more differences from the system or group model; these intervals can contain messages that might help you diagnose anomalous system behavior. The analysis snapshots for these analysis intervals are the color gold.
101
Analysis intervals with this score exhibit the most significant differences from the system or group model; these intervals contain messages that merit investigation. ADE assigns this score to analysis intervals that contain:
  • Unusual or unexpected messages.
  • A much higher volume of messages than expected.
model_internal_id
An integer that the ADE uses to identify this system model.
melody_version
An integer that represents the version of the analytics engine that ADE is using. In the sample output at the end of this document, this element appears under the name ade_version.
gmt_offset
A string that indicates the difference in hours and minutes from Coordinated Universal Time (UTC) for the requested start time, for example GMT+00:00.
model_info
Provides information about the model associated with the specified system.
model_creation_date
An attribute of model_info that provides the date and time when ADE successfully built the most recent model of system behavior.
training_period
An integer that indicates the number of consecutive calendar days that ADE uses to identify the instrumentation data to include in training models.
interval_size_in_sec
An integer that indicates the number of seconds in an interval.
analysis_group
An attribute of model_info that provides the name of a Linux model group in the ADE topology.
msg_summary
An element that contains summary information about messages in the interval.
num_new_msg
An integer that provides the total number of new messages issued by the system during this interval.
interval_message
The XML response contains one interval_message element for each unique message ID that was issued within the interval specified on the INTERVAL request. Each interval_message contains the following child elements and attributes for the message.
num_instances
An integer that specifies the number of times that this message was issued within this analysis interval. The length of the interval is given by the interval_size_in_sec attribute of model_info.
bernoulli
A double value that indicates how frequently the message ID is issued within a sampled set of analysis intervals in the system model. Values range from 1 to 101:
  • A value of 1 indicates that the message is issued in almost all analysis intervals in the model.
  • A value of 100 indicates that the message is issued in almost none of the analysis intervals in the model.
  • A value of 101 indicates that this message ID has not been issued in any analysis interval in the model.
frequency
A double value (the frequency attribute of the bernoulli element) that indicates the average number of analysis intervals in which the message is issued each day, according to analysis of the message data that ADE uses for training.
cluster_id
An integer that represents the identifier of the cluster to which this message belongs. When the message is not part of a recognized cluster, the cluster ID is -1.
periodicity status
The status attribute of the periodicity element, which indicates whether or not this message has a tendency to recur at specific times, and whether the message recurred as expected within the analysis interval. Valid values are:
NEW
ADE has not previously detected this message.
IN_SYNC
ADE expects this message to be issued in a periodic pattern, and the message was issued as expected during the analysis interval.
NOT_IN_SYNC
ADE expects this message to be issued in a periodic pattern, but the message was not issued as expected during the analysis interval.
NOT_PERIODIC
ADE does not expect this message to be issued in a periodic pattern.
last_issued
An optional attribute of the periodicity element that provides the UTC date and time when this message was last observed.
score
A double value (an optional attribute of the periodicity element) that indicates how the periodicity status of this message contributed to the message anomaly score for the analysis interval. Higher scores generally indicate greater contribution to the message anomaly score.
poisson
A double value that indicates how closely the message ID distribution in current data matches the Poisson distribution of that message ID in data during the training period for the system model. This value is provided only for message IDs that are not part of a cluster. The higher the poisson value, the greater the difference from expected behavior.
intCont
A double value that indicates the relative contribution of this message to the interval anomaly score for the analysis interval. This interval score is a function of the message anomaly score, the number of times that the message appears within this interval, and whether the message appeared in context.
normIntCont
A double value that indicates the normalized contribution of this message to the interval anomaly score for the analysis interval.
anomaly
A double value that indicates the rarity of this specific message ID within the selected interval. The anomaly score is a combination of the interval contribution score for this message and the rule, if any, that is in effect for this message. Higher scores indicate greater anomaly so messages with high anomaly scores are more likely to indicate a problem.
cluster_status
A string that indicates whether or not this message is part of an expected pattern of messages associated with a routine system event (for example, starting a subsystem or workload). ADE identifies and recognizes these patterns or groups, which are called "clusters", and the specific message IDs that constitute a specific cluster. When analyzing data, the ADE determines whether a specific message is expected to be issued within a specific cluster. A message that is issued out of context (without the other messages in the same cluster) might indicate a problem.
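Values for cluster_status are listed below. In the XML output the values appear in upper case with underscores, for example NEW, UNCLUSTERED, and OUT_OF_CONTEXT in the sample output: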
New
ADE has not previously detected this message in the model.
Unclustered
This message is not part of a defined cluster.
In context
ADE expects this message to be issued within a specific cluster, and the message was issued as expected in the analysis interval.
Out of context
ADE expects this message to be issued within a specific cluster, but the message was issued in a different context during the analysis interval.
critical_words
A double value that indicates whether the message contains specific words that indicate potential problems. Critical words include "abend", "failure", and "warning".
text_sum
A string that contains a summary of the common message text that was issued for each occurrence of the same message.
text_smp
A string that contains the full message text for the first occurrence of this message within the interval.
time_vec
The XML response contains one time_vec element for each unique message ID that was issued within the interval specified on the INTERVAL request.
occ
The XML response contains one occ element for each time that this message ID was issued within the interval specified on the INTERVAL request.
msg_id
A string that identifies the unique message ID.

Sample XML Output Messages within an Interval

The output:
<?xml version='1.0' encoding='UTF-8' ?> 
<?xml-stylesheet href='./xslt/AdeCoreIntervalV2.xsl' type='text/xsl' ?> 
<!-- Sample Version 2 INTERVAL output: one analysis interval for one system. -->
<!-- NOTE(review): the root element declares a default namespace, yet the schema hint is given
     with xsi:noNamespaceSchemaLocation; xsi:schemaLocation is the attribute intended for
     documents whose root is in a namespace. Confirm against the generator before relying on
     this hint for validation. -->
<interval xsi:noNamespaceSchemaLocation="/xml/AdeCoreIntervalV2.xsd" xmlns="http://www.openmainframe.org/ade/AdeCoreIntervalV2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <!-- Interval-level fields: API version, system, time range, interval anomaly score and model details. -->
    <version>2</version>
    <sys_id>sys1.openmainframe.org</sys_id>
    <start_time>2015-12-11T23:50:00.000Z</start_time>
    <end_time>2015-12-12T00:50:00.000Z</end_time>
    <anomaly_score>99.5</anomaly_score>
    <model_internal_id>1</model_internal_id>
    <ade_version>321</ade_version>
    <gmt_offset>GMT+00:00</gmt_offset>
    <!-- interval_size_in_sec="3600": this sample uses one-hour analysis intervals. -->
    <model_info model_creation_date="2016-02-10T16:02:56.840Z" training_period="7" interval_size_in_sec="3600" analysis_group="default" limited_model="Yes"/>
    <msg_summary num_new_msg="8"/>
    <!-- One interval_message element per unique message ID (msg_id) issued in the interval. -->
    <interval_message msg_id="sshd_47">
        <num_instances>24</num_instances>
        <bernoulli frequency="0.07058823529411765">99.70588235294117</bernoulli>
        <cluster_id>14</cluster_id>
        <periodicity status="NOT_PERIODIC" last_issued="2015-12-11T23:35:30.000Z"/>
        <poisson mean="1.4234978142529124">0.9999876239444027</poisson>
        <intCont>6.920749476374009</intCont>
        <normIntCont>6.920749476374009</normIntCont>
        <anomaly>0.9990129101373006</anomaly>
        <cluster_status>OUT_OF_CONTEXT</cluster_status>
        <critical_words>0.0</critical_words>
        <text_sum>Accepted keyboard-interactive/pam for user1 from 5dbf2b6d08477c5fc7774f590b31abd4f39be38a port * ssh2</text_sum>
        <text_smp>Accepted keyboard-interactive/pam for user1 from 5dbf2b6d08477c5fc7774f590b31abd4f39be38a port 1582 ssh2</text_smp>
        <time_vec>
            <occ>1</occ>
            <occ>31</occ>
            <occ>61</occ>
            <occ>62</occ>
            <occ>91</occ>
            <occ>92</occ>
        </time_vec>
    </interval_message>
    <!-- cluster_id of -1: the message is not part of a recognized cluster. -->
    <interval_message msg_id="/usr/sbin/cron(notes):/home/notes/getsysdata.ksh_311">
        <num_instances>4</num_instances>
        <bernoulli frequency="0.07100591715976332">101.0</bernoulli>
        <cluster_id>-1</cluster_id>
        <periodicity status="NOT_PERIODIC" last_issued="2015-12-11T23:45:00.000Z"/>
        <poisson mean="2.0">0.0</poisson>
        <intCont>4.656066344281889</intCont>
        <normIntCont>4.656066344281889</normIntCont>
        <anomaly>0.9904962264578541</anomaly>
        <cluster_status>NEW</cluster_status>
        <critical_words>0.0</critical_words>
        <text_sum>(notes) CMD (/home/notes/getsysdata.ksh 1&gt;/home/notes/run.out 2&gt;&amp;1)</text_sum>
        <text_smp>(notes) CMD (/home/notes/getsysdata.ksh 1&gt;/home/notes/run.out 2&gt;&amp;1)</text_smp>
        <time_vec>
            <occ>20</occ>
            <occ>50</occ>
            <occ>80</occ>
            <occ>110</occ>
        </time_vec>
    </interval_message>
    <interval_message msg_id="/usr/sbin/cron(notesa):/home/notes/getdomdata.ksh_309">
        <num_instances>4</num_instances>
        <bernoulli frequency="0.07100591715976332">101.0</bernoulli>
        <cluster_id>-1</cluster_id>
        <periodicity status="NOT_PERIODIC" last_issued="2015-12-11T23:45:00.000Z"/>
        <poisson mean="2.0">0.0</poisson>
        <intCont>4.656066344281889</intCont>
        <normIntCont>4.656066344281889</normIntCont>
        <anomaly>0.9904962264578541</anomaly>
        <cluster_status>NEW</cluster_status>
        <critical_words>0.0</critical_words>
        <text_sum>(notesa) CMD (/home/notes/getdomdata.ksh 1&gt;~/doms.out 2&gt;&amp;1)</text_sum>
        <text_smp>(notesa) CMD (/home/notes/getdomdata.ksh 1&gt;~/doms.out 2&gt;&amp;1)</text_smp>
        <time_vec>
            <occ>20</occ>
            <occ>50</occ>
            <occ>80</occ>
            <occ>110</occ>
        </time_vec>
    </interval_message>
    <!-- The row of dots below is presumably an elision marker for interval_message elements left out of this sample; it is not part of the schema. -->
    ..........
    <!-- The only entry in this sample whose periodicity element carries the optional score attribute. -->
    <interval_message msg_id="sudo(root):/etc/init.d/syslog_41">
        <num_instances>1</num_instances>
        <bernoulli frequency="23.788235294117648">1.0</bernoulli>
        <cluster_id>-1</cluster_id>
        <periodicity status="NOT_IN_SYNC" last_issued="2015-12-11T23:45:00.000Z" score="0.005970166986503796"/>
        <poisson mean="1.0">0.0</poisson>
        <intCont>-3.777176513347405</intCont>
        <normIntCont>-3.777176513347405</normIntCont>
        <anomaly>0.0</anomaly>
        <cluster_status>UNCLUSTERED</cluster_status>
        <critical_words>0.0</critical_words>
        <text_sum>    root : TTY=unknown ; PWD=/root ; USER=root ; COMMAND=/etc/init.d/syslog status</text_sum>
        <text_smp>    root : TTY=unknown ; PWD=/root ; USER=root ; COMMAND=/etc/init.d/syslog status</text_smp>
        <time_vec>
            <occ>110</occ>
        </time_vec>
    </interval_message>
</interval>