Telegraf: Cannot use instance wildcard in win_perf_counters

Created on 2 Jun 2017 · 12 Comments · Source: influxdata/telegraf

Bug report

According to #1827, which was released in version 1.3, Windows multi-instance processes should be reported as separate metrics, but my tests show that they are not when you gather all instances (*).

Relevant telegraf.conf:

[[outputs.prometheus_client]]
  listen = "0.0.0.0:15000"

[[inputs.win_perf_counters]]
  [[inputs.win_perf_counters.object]]
    ObjectName = "Process"
    Instances = ["*"]
    Counters = [
      "% Processor Time"
    ]
    Measurement = "win_proc"

System info:

Telegraf 1.3.1
Windows 10

Steps to reproduce:

  1. Run telegraf.exe --config .\telegraf.conf --test with above config

Expected behavior:

...
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#1,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#2,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#3,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#4,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,host=DUMMY-HOST,instance=chrome#5,objectname=Process Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#6,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#7,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#7,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,host=DUMMY-HOST,instance=chrome#8,objectname=Process Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#9,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#10,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#11,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#12,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#13,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,host=DUMMY-HOST,instance=chrome#14,objectname=Process Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#15,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,objectname=Process,host=DUMMY-HOST,instance=chrome#16 Percent_Processor_Time=0 1496391871000000000
win_proc,objectname=Process,host=DUMMY-HOST,instance=chrome#17 Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#18,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#19,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome#20,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
...

Actual behavior:

...
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,host=DUMMY-HOST,instance=chrome,objectname=Process Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,host=DUMMY-HOST,instance=chrome,objectname=Process Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,host=DUMMY-HOST,instance=chrome,objectname=Process Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,objectname=Process,host=DUMMY-HOST,instance=chrome Percent_Processor_Time=0 1496391871000000000
win_proc,objectname=Process,host=DUMMY-HOST,instance=chrome Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
...

Additional info:

Side note: Using this together with the Prometheus output plugin gives one metric for process chrome for example.

If I change the configuration and list specific instances, it reports metrics per process and instance as expected, i.e.:

[[inputs.win_perf_counters]]
  [[inputs.win_perf_counters.object]]
    ObjectName = "Process"
    Instances = ["chrome", "chrome#1", "chrome#2"]
    Counters = [
      "% Processor Time"
    ]
    Measurement = "win_proc"
area/windows platform/windows

Most helpful comment

I will take a look at it, but I don't think it is just a quick fix. I have to think about the best solution, and that will take some time.

All 12 comments

Thanks for the bug report, does it work if you only have "chrome" in the Instances?

[[inputs.win_perf_counters]]
  [[inputs.win_perf_counters.object]]
    ObjectName = "Process"
    Instances = ["chrome"]
    Counters = [
      "% Processor Time"
    ]
    Measurement = "win_proc"

No, it doesn't. Only one metric for the chrome process is reported.

@bullshit Do you have any idea what might be causing this?

I used my C++ test code with the \Process(*)\% Processor Time query, and it seems that pdh.dll cannot handle the instance names correctly. This is the output:
instance

win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000
win_proc,instance=chrome,objectname=Process,host=DUMMY-HOST Percent_Processor_Time=0 1496391871000000000

So Telegraf retrieved the instance name correctly from pdh.dll, but you cannot tell whether a given instance is actually e.g. chrome#1, chrome#2, and so on. The same behaviour occurs when you query only the chrome instances: \\Process(chrome*)\\% Processor Time

One way to avoid this when using * in the instance name would be to first query all instances in AddItem and then create a unique query/metric for every instance. But I'm not sure if this really works.
The Performance Monitor GUI handles this the same way:
Perfmon1
Perfmon2

Just wanted to jump in and confirm that my original issue, https://github.com/influxdata/telegraf/issues/1827, is not solved.

[[inputs.win_perf_counters.object]]
ObjectName = "Process"
Counters = ["% Processor Time","Pool Nonpaged Bytes","Pool Paged Bytes","Private Bytes","Working Set - Private"]
Instances = ["*"]
Measurement = "win_processes"
IncludeTotal=true

The above still results in very weird percentages for processes that have multiple instances.

image

Single instance services behave normally.

Same here. Using a wildcard in 'Instances' produces metrics only for the first instance, while listing a few instances (e.g. "chrome", "chrome#1", "chrome#2", etc.) results in strange behavior: n different metrics are produced, but they do not change dynamically as new processes come up. I mean, launching Telegraf with 2 chrome.exe processes open produces 2 metrics, but opening a third one does not produce a third metric. I'm not even sure that the associated counters were working correctly.

Many thanks!

Is there any ETA for resolution regarding this issue ?

@dudusakharovich I don't think anyone is actively working on this, fix will depend on a contribution from the community.

Sure, it depends, as does pretty much any project here ;) Anyway, it looks like an important issue to fix. I would love to be able to contribute, but....

I will take a look at it, but I don't think it is just a quick fix. I have to think about the best solution, and that will take some time.

Other related records: #1291, #3018, #2336

It looks like solving wildcard expansion (#1291) in PR #3018 also resolved this problem. Output on my Windows 10:

> win_proc,host=T480,instance=chrome#25,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#24,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#23,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#22,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#21,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#20,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=RAVBg64#1,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=RAVBg64,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=RAVCpl64,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#19,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#18,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#17,objectname=Process Percent_Processor_Time=1.108178973197937 1525982598000000000
> win_proc,host=T480,instance=svchost#15,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#16,objectname=Process Percent_Processor_Time=1.1244800090789795 1525982598000000000
> win_proc,host=T480,instance=chrome#15,objectname=Process Percent_Processor_Time=1.1322720050811768 1525982598000000000
> win_proc,host=T480,instance=chrome#14,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=aesm_service,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=jhi_service,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=svchost#14,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=svchost#13,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=LMS,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=svchost#12,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=SocketHeciServer,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=svchost#11,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=svchost#10,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=ApplicationFrameHost,objectname=Process Percent_Processor_Time=0 1525982598000000000
> win_proc,host=T480,instance=chrome#13,objectname=Process Percent_Processor_Time=0 1525982598000000000 

Looking at the state of #3031, there are at least several regression problems to solve. I can work on that.

Was this page helpful?
0 / 5 - 0 ratings