ESGFNodeInstallation: is-enes2_statistics.txt

File is-enes2_statistics.txt, 10.9 KB (added by terryk, 7 years ago)
Line 
1IS-ENES2 Data Node Statistics
2
3Preparatory queries for the data download statistics
4Below, a script to be run on the IS-ENES2 nodes in order to get the data usage statistics is reported:
5
61)  psql esgcet -U dbsuper -c "select distinct remote_addr from esgf_node_manager.access_logging" > ip_addresses.txt
71.1) open the file ip_addresses.txt and cleanup the first two rows and the last one at the end of the file
8
92) ./geoiplookup0.2.sh > ip_addr_country.txt
10
113) Creation of a support table in the esgf_dashboard namespace
123.1) psql esgcet -U dbsuper -c "drop table esgf_dashboard.stats_support_table;"
133.2) psql esgcet -U dbsuper -c "create table esgf_dashboard.stats_support_table(ip varchar(20), country varchar(10), eunoteu varchar(10));"
14
154) psql esgcet -U dbsuper -e < ip_addr_country.txt
16
175) psql esgcet -U dbsuper -c "update esgf_dashboard.stats_support_table set eunoteu='EU' where country in ('FR','GB','SI','HR','BE','CZ','GR','NO','IT','ES','DE','FI', 'DK', 'SE', 'CH', 'RS', 'NL', 'ME', 'LU', 'PT', 'LI', 'IE', 'HU', 'EE','BG', 'BA', 'AT', 'AL', 'SK', 'RS', 'SM', 'RO', 'PL', 'MC', 'MT');"
18
196) psql esgcet -U dbsuper -c "update esgf_dashboard.stats_support_table set eunoteu='NotEU' where eunoteu is NULL;"
20
217) psql esgcet -U dbsuper -c "create table esgf_dashboard.euips as select distinct ip from esgf_dashboard.stats_support_table where eunoteu='EU';"
22
238) psql esgcet -U dbsuper -e < query_stats.txt
24
25
26Statistics queries
271. Get monthly data about all projects related to
28- number of users
29- number of successful downloads
30- number of successfully downloaded files
31- download volume (GB)
32for EU users since April 2013
33
34SELECT                                                                                                               
35  EXTRACT (YEAR FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS year,
36  EXTRACT (MONTH FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS month,
37  COUNT(*) AS downloads,                                                                                                                                     
38  COUNT(distinct url) AS files,
39  COUNT(distinct user_id_hash) AS users,
40  TO_CHAR(SUM(fixed_log.size)/1024/1024/1024, '9,999,999.99') AS gb
41FROM (                         
42  SELECT file.url, log.user_id_hash, max(log.date_fetched) AS date_fetched, max(file.size) AS size
43  FROM esgf_node_manager.access_logging AS log
44  JOIN public.file_version AS file
45  ON (log.url LIKE '%.nc' AND log.url = file.url)
46  WHERE log.success AND log.duration > 1000
47  AND log.date_fetched > 1364774400
48  AND log.remote_addr IN (
49     SELECT ip AS remote_addr
50     FROM esgf_dashboard.euips)
51  GROUP BY file.url, log.user_id_hash
52  ) AS fixed_log
53GROUP BY year, month
54ORDER BY year, month;
55
56
572. Get monthly data about all projects related to
58- number of users
59- number of successful downloads
60- number of successfully downloaded files
61- download volume (GB)
62for extra-EU users since April 2013
63
64SELECT                                                                                                               
65  EXTRACT (YEAR FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS year,
66  EXTRACT (MONTH FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS month,
67  COUNT(*) AS downloads,                                                                                                                                     
68  COUNT(distinct url) AS files,
69  COUNT(distinct user_id_hash) AS users,
70  TO_CHAR(SUM(fixed_log.size)/1024/1024/1024, '9,999,999.99') AS gb
71FROM (                         
72  SELECT file.url, log.user_id_hash, max(log.date_fetched) AS date_fetched, max(file.size) AS size
73  FROM esgf_node_manager.access_logging AS log
74  JOIN public.file_version AS file
75  ON (log.url LIKE '%.nc' AND log.url = file.url)
76  WHERE log.success AND log.duration > 1000
77  AND log.date_fetched > 1364774400
78  AND log.remote_addr NOT IN (
79     SELECT ip AS remote_addr
80     FROM esgf_dashboard.euips)
81  GROUP BY file.url, log.user_id_hash
82  ) AS fixed_log
83GROUP BY year, month
84ORDER BY year, month;
85
86
873.1 Get monthly data about a single project (e.g. cmip5) related to
88- number of users
89- number of successful downloads
90- number of successfully downloaded files
91- download volume (GB)
92for EU users since April 2013
93(version including the regexpr for log.url=file.url)
94
95SELECT                                                                                                               
96  EXTRACT (YEAR FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS year,
97  EXTRACT (MONTH FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS month,
98  COUNT(*) AS downloads,                                                                                                                                     
99  COUNT(distinct url) AS files,
100  COUNT(distinct user_id_hash) AS users,
101  TO_CHAR(SUM(fixed_log.size)/1024/1024/1024, '9,999,999.99') AS gb
102FROM (                         
103  SELECT file.url, log.user_id_hash, max(log.date_fetched) AS date_fetched, max(file.size) AS size
104  FROM esgf_node_manager.access_logging AS log
105  JOIN public.file_version AS file
106  ON (log.url LIKE '%.nc' AND LOWER(regexp_replace(log.url, E'^.*/(cmip5/.*\.nc)$', E'\\1')) = LOWER(regexp_replace(file.url, E'^.*(cmip5/.*\.nc)$', E'\\1')))
107  WHERE log.success AND log.duration > 1000
108  AND log.date_fetched > 1364774400
109  AND log.remote_addr IN (
110     SELECT ip AS remote_addr
111     FROM esgf_dashboard.euips)
112  GROUP BY file.url, log.user_id_hash
113  ) AS fixed_log
114GROUP BY year, month
115ORDER BY year, month;
116
117
1183.2 Get monthly data about a single project (e.g. cmip5) related to
119- number of users
120- number of successful downloads
121- number of successfully downloaded files
122- download volume (GB)
123for EU users since April 2013
124(this is a new version without the regexpr for log.url=file.url. It would be great if you could test it and provide us a feedback with regard to the 3.1 query)
125
126SELECT                                                                                                               
127  EXTRACT (YEAR FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS year,
128  EXTRACT (MONTH FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS month,
129  COUNT(*) AS downloads,                                                                                                                                     
130  COUNT(distinct url) AS files,
131  COUNT(distinct user_id_hash) AS users,
132  TO_CHAR(SUM(fixed_log.size)/1024/1024/1024, '9,999,999.99') AS gb
133FROM (                         
134  SELECT file.url, log.user_id_hash, max(log.date_fetched) AS date_fetched, max(file.size) AS size
135  FROM esgf_node_manager.access_logging AS log
136  JOIN public.file_version AS file
137  ON (log.url LIKE '%cmip5%.nc' AND log.url = file.url)
138  WHERE log.success AND log.duration > 1000
139  AND log.date_fetched > 1364774400
140  AND log.remote_addr IN (
141     SELECT ip AS remote_addr
142     FROM esgf_dashboard.euips)
143  GROUP BY file.url, log.user_id_hash
144  ) AS fixed_log
145GROUP BY year, month
146ORDER BY year, month;
147
148
1494.1 Get monthly data about a single project (e.g. cmip5) related to
150- number of users
151- number of successful downloads
152- number of successfully downloaded files
153- download volume (GB)
154for extra-EU users since April 2013
155(version including the regexpr for log.url=file.url)
156
157SELECT                                                                                                               
158  EXTRACT (YEAR FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS year,
159  EXTRACT (MONTH FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS month,
160  COUNT(*) AS downloads,                                                                                                                                     
161  COUNT(distinct url) AS files,
162  COUNT(distinct user_id_hash) AS users,
163  TO_CHAR(SUM(fixed_log.size)/1024/1024/1024, '9,999,999.99') AS gb
164FROM (                         
165  SELECT file.url, log.user_id_hash, max(log.date_fetched) AS date_fetched, max(file.size) AS size
166  FROM esgf_node_manager.access_logging AS log
167  JOIN public.file_version AS file
168  ON (log.url LIKE '%.nc' AND LOWER(regexp_replace(log.url, E'^.*/(cmip5/.*\.nc)$', E'\\1')) = LOWER(regexp_replace(file.url, E'^.*(cmip5/.*\.nc)$', E'\\1')))
169  WHERE log.success AND log.duration > 1000
170  AND log.date_fetched > 1364774400
171  AND log.remote_addr NOT IN (
172     SELECT ip AS remote_addr
173     FROM esgf_dashboard.euips)
174  GROUP BY file.url, log.user_id_hash
175  ) AS fixed_log
176GROUP BY year, month
177ORDER BY year, month;
178
179
1804.2 Get monthly data about a single project (e.g. cmip5) related to
181- number of users
182- number of successful downloads
183- number of successfully downloaded files
184- download volume (GB)
185for extra-EU users since April 2013
186(this is a new version without the regexpr for log.url=file.url. It would be great if you could test it and provide us a feedback with regard to the 4.1 query)
187
188SELECT                                                                                                               
189  EXTRACT (YEAR FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS year,
190  EXTRACT (MONTH FROM (TIMESTAMP WITH TIME ZONE 'epoch' + fixed_log.date_fetched * INTERVAL '1 second')) AS month,
191  COUNT(*) AS downloads,                                                                                                                                     
192  COUNT(distinct url) AS files,
193  COUNT(distinct user_id_hash) AS users,
194  TO_CHAR(SUM(fixed_log.size)/1024/1024/1024, '9,999,999.99') AS gb
195FROM (                         
196  SELECT file.url, log.user_id_hash, max(log.date_fetched) AS date_fetched, max(file.size) AS size
197  FROM esgf_node_manager.access_logging AS log
198  JOIN public.file_version AS file
199  ON (log.url LIKE '%cmip5%.nc' AND log.url = file.url)
200  WHERE log.success AND log.duration > 1000
201  AND log.date_fetched > 1364774400
202  AND log.remote_addr NOT IN (
203     SELECT ip AS remote_addr
204     FROM esgf_dashboard.euips)
205  GROUP BY file.url, log.user_id_hash
206  ) AS fixed_log
207GROUP BY year, month
208ORDER BY year, month;
209
210
211
212Other useful statistics
2135. Get distinct remote locations (by remote address) who have performed at least 1 download from the data node
214select distinct(remote_addr) from esgf_node_manager.access_logging;
215
216
2176. Get the number of distinct users (by remote address) who have performed at least 1 download from the data node
218select count(distinct(remote_addr)) from esgf_node_manager.access_logging;
219
220
2217. Get the number of distinct users (by user id hash) who have performed at least 1 download from the data node
222select count(distinct(user_id_hash)) from esgf_node_manager.access_logging;
223
224
2258. Get distinct user idp nodes associated to the data downloads
226select distinct(user_idp) from esgf_node_manager.access_logging;
227
228