Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
TCondor.cxx
Go to the documentation of this file.
1// @(#)root/proof:$Id$
2// Author: Maarten Ballintijn 06/12/03
3
4/*************************************************************************
5 * Copyright (C) 1995-2001, Rene Brun and Fons Rademakers. *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12//////////////////////////////////////////////////////////////////////////
13// //
14// TCondor //
15// //
16// Interface to the Condor system. TCondor provides a (partial) API for //
17// querying and controlling the Condor system, including experimental //
18// extensions like COD (computing on demand) //
19// //
20//////////////////////////////////////////////////////////////////////////
21
22#include <stdlib.h>
23
24#include "TCondor.h"
25#include "TList.h"
26#include "TSystem.h"
27#include "TObjString.h"
28#include "TRegexp.h"
29#include "TProofDebug.h"
30#include "Riostream.h"
31#include "TEnv.h"
32#include "TClass.h"
33
36
37
38////////////////////////////////////////////////////////////////////////////////
39/// Create Condor interface object. Uses Condor apps since there is no
40/// API yet.
41
42TCondor::TCondor(const char *pool) : fPool(pool), fState(kFree)
43{
44 fClaims = new TList;
45
46 // Setup Condor
47
48 TString condorHome = gEnv->GetValue("Proof.CondorHome", (char*)0);
49 if (condorHome != "") {
50 TString path = gSystem->Getenv("PATH");
51 path = condorHome + "/bin:" + path;
52 gSystem->Setenv("PATH",path);
53 }
54
55 TString condorConf = gEnv->GetValue("Proof.CondorConfig", (char*)0);
56 if (condorConf != "") {
57 gSystem->Setenv("CONDOR_CONFIG",condorConf);
58 }
59
60 char *loc = gSystem->Which(gSystem->Getenv("PATH"), "condor_cod",
62
63 if (loc) {
64 fValid = kTRUE;
65 delete [] loc;
66 } else {
67 fValid = kFALSE;
68 }
69}
70
71
72////////////////////////////////////////////////////////////////////////////////
73/// Cleanup Condor interface.
74
76{
77 PDB(kCondor,1) Info("~TCondor","fState %d", fState );
78
79 if (fState != kFree) {
80 Release();
81 }
82 delete fClaims;
83}
84
85
86////////////////////////////////////////////////////////////////////////////////
87/// Print master status
88
89void TCondor::Print(Option_t * opt) const
90{
91 std::cout << "OBJ: " << IsA()->GetName()
92 << "\tPool: \"" << fPool << "\""
93 << "\tState: " << fState << std::endl;
94 fClaims->Print(opt);
95}
96
97
98////////////////////////////////////////////////////////////////////////////////
99/// Claim a VirtualMachine for PROOF usage.
100
101TCondorSlave *TCondor::ClaimVM(const char *vm, const char *cmd)
102{
103 Int_t port = 0;
104
105 TString claimCmd = Form("condor_cod request -name %s -timeout 10 2>>%s/condor.proof.%d",
106 vm, gSystem->TempDirectory(), gSystem->GetUid() );
107
108 PDB(kCondor,2) Info("ClaimVM","command: %s", claimCmd.Data());
109 FILE *pipe = gSystem->OpenPipe(claimCmd, "r");
110
111 if (!pipe) {
112 SysError("ClaimVM","cannot run command: %s", claimCmd.Data());
113 return 0;
114 }
115
116 TString claimId;
118 while (line.Gets(pipe)) {
119 PDB(kCondor,3) Info("ClaimVM","line = %s", line.Data());
120
121 if (line.BeginsWith("ClaimId = \"")) {
122 line.Remove(0, line.Index("\"")+1);
123 line.Chop(); // remove trailing "
124 claimId = line;
125 PDB(kCondor,1) Info("ClaimVM","claim = '%s'", claimId.Data());
126 TRegexp r("[0-9]*$");
127 TString num = line(r);
128 port = 37000 + atoi(num.Data());
129 PDB(kCondor,1) Info("ClaimVM","port = %d", port);
130 }
131 }
132
133 Int_t r = gSystem->ClosePipe(pipe);
134 if (r) {
135 Error("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
136 return 0;
137 } else {
138 PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
139 }
140
141 TString jobad("jobad");
142 FILE *jf = gSystem->TempFileName(jobad);
143
144 if (jf == 0) return 0;
145
146 TString str(cmd);
147 str.ReplaceAll("$(Port)", Form("%d", port));
148 fputs(str, jf);
149
150 fclose(jf);
151
152 TString activateCmd = Form("condor_cod activate -id '%s' -jobad %s",
153 claimId.Data(), jobad.Data() );
154
155 PDB(kCondor,2) Info("ClaimVM","command: %s", activateCmd.Data());
156 pipe = gSystem->OpenPipe(activateCmd, "r");
157
158 if (!pipe) {
159 SysError("ClaimVM","cannot run command: %s", activateCmd.Data());
160 return 0;
161 }
162
163 while (line.Gets(pipe)) {
164 PDB(kCondor,3) Info("ClaimVM","Activate: line = %s", line.Data());
165 }
166
167 r = gSystem->ClosePipe(pipe);
168 if (r) {
169 Error("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
170 } else {
171 PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
172 }
173
174 gSystem->Unlink(jobad);
175
176 // TODO: get info at the start for all nodes ...
177 TCondorSlave *claim = new TCondorSlave;
178 claim->fClaimID = claimId;
179 TString node(vm);
180 node = node.Remove(0, node.Index("@")+1);
181 claim->fHostname = node;
182 claim->fPort = port;
183 claim->fPerfIdx = 100; //set performance index to 100 by default
184 claim->fImage = node; //set image to hostname by default
185
186 return claim;
187}
188
189
190////////////////////////////////////////////////////////////////////////////////
191/// Get the names of the virtual machines in the pool.
192/// Return a TList of TObjString or 0 in case of failure
193
195{
196 TString poolopt = fPool ? Form("-pool %s", fPool.Data()) : "";
197 TString cmd = Form("condor_status %s -format \"%%s\\n\" Name", poolopt.Data());
198
199 PDB(kCondor,2) Info("GetVirtualMachines","command: %s", cmd.Data());
200
201 FILE *pipe = gSystem->OpenPipe(cmd, "r");
202
203 if (!pipe) {
204 SysError("GetVirtualMachines","cannot run command: %s", cmd.Data());
205 return 0;
206 }
207
209 TList *l = new TList;
210 while (line.Gets(pipe)) {
211 PDB(kCondor,3) Info("GetVirtualMachines","line = %s", line.Data());
212 if (line != "") l->Add(new TObjString(line));
213 }
214
215 Int_t r = gSystem->ClosePipe(pipe);
216 if (r) {
217 delete l;
218 Error("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
219 return 0;
220 } else {
221 PDB(kCondor,1) Info("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
222 }
223
224 return l;
225}
226
227
228////////////////////////////////////////////////////////////////////////////////
229/// Claim n virtual machines
230/// This function figures out the image and performance index before returning
231/// the list of condor slaves
232
233TList *TCondor::Claim(Int_t n, const char *cmd)
234{
235 if (fState != kFree) {
236 Error("Claim","not in state Free");
237 return 0;
238 }
239
240 TList *vms = GetVirtualMachines();
241 TIter next(vms);
242 TObjString *vm;
243 for(Int_t i=0; i < n && (vm = (TObjString*) next()) != 0; i++ ) {
244 TCondorSlave *claim = ClaimVM(vm->GetName(), cmd);
245 if (claim != 0) {
246 if ( !GetVmInfo(vm->GetName(), claim->fImage, claim->fPerfIdx) ) {
247 // assume vm is gone
248 delete claim;
249 } else {
250 fClaims->Add(claim);
251 fState = kActive;
252 }
253 }
254 }
255
256 vms->Delete();
257 delete vms;
258
259 return fClaims;
260}
261
262
263////////////////////////////////////////////////////////////////////////////////
264/// Claim virtual machine with name vmname
265/// This function does not figure out the image and performance index before
266/// returning the condor slave
267
268TCondorSlave *TCondor::Claim(const char *vmname, const char *cmd)
269{
270 if (fState != kFree && fState != kActive) {
271 Error("Claim","not in state Free or Active");
272 return 0;
273 }
274
275 TCondorSlave *claim = ClaimVM(vmname, cmd);
276 if (claim != 0) {
277 fClaims->Add(claim);
278 fState = kActive;
279 }
280
281 return claim;
282}
283
284
285////////////////////////////////////////////////////////////////////////////////
286/// Set the state of workers
287
289{
290 PDB(kCondor,1) Info("SetState","state: %s (%lld)",
291 state == kSuspended ? "kSuspended" : "kActive", Long64_t(gSystem->Now()));
292 TIter next(fClaims);
293 TCondorSlave *claim;
294 while((claim = (TCondorSlave*) next()) != 0) {
295 TString cmd = Form("condor_cod %s -id '%s'",
296 state == kSuspended ? "suspend" : "resume",
297 claim->fClaimID.Data());
298
299 PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
300 FILE *pipe = gSystem->OpenPipe(cmd, "r");
301
302 if (!pipe) {
303 SysError("SetState","cannot run command: %s", cmd.Data());
304 return kFALSE;
305 }
306
308 while (line.Gets(pipe)) {
309 PDB(kCondor,3) Info("SetState","line = %s", line.Data());
310 }
311
312 Int_t r = gSystem->ClosePipe(pipe);
313 if (r) {
314 Error("SetState","command: %s returned %d", cmd.Data(), r);
315 return kFALSE;
316 } else {
317 PDB(kCondor,1) Info("SetState","command: %s returned %d", cmd.Data(), r);
318 }
319 }
320
321 fState = state;
322 return kTRUE;
323}
324
325
326////////////////////////////////////////////////////////////////////////////////
327/// Suspend worker
328
330{
331 if (fState != kActive) {
332 Error("Suspend","not in state Active");
333 return kFALSE;
334 }
335
336 return SetState(kSuspended);
337}
338
339
340////////////////////////////////////////////////////////////////////////////////
341/// Resume worker
342
344{
345 if (fState != kSuspended) {
346 Error("Suspend","not in state Suspended");
347 return kFALSE;
348 }
349
350 return SetState(kActive);
351}
352
353
354////////////////////////////////////////////////////////////////////////////////
355/// Release worker
356
358{
359 if (fState == kFree) {
360 Error("Suspend","not in state Active or Suspended");
361 return kFALSE;
362 }
363
364 TCondorSlave *claim;
365 while((claim = (TCondorSlave*) fClaims->First()) != 0) {
366 TString cmd = Form("condor_cod release -id '%s'", claim->fClaimID.Data());
367
368 PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
369 FILE *pipe = gSystem->OpenPipe(cmd, "r");
370
371 if (!pipe) {
372 SysError("Release","cannot run command: %s", cmd.Data());
373 return kFALSE;
374 }
375
377 while (line.Gets(pipe)) {
378 PDB(kCondor,3) Info("Release","line = %s", line.Data());
379 }
380
381 Int_t r = gSystem->ClosePipe(pipe);
382 if (r) {
383 Error("Release","command: %s returned %d", cmd.Data(), r);
384 return kFALSE;
385 } else {
386 PDB(kCondor,1) Info("Release","command: %s returned %d", cmd.Data(), r);
387 }
388
389 fClaims->Remove(claim);
390 delete claim;
391 }
392
393 fState = kFree;
394 return kTRUE;
395}
396
397
398////////////////////////////////////////////////////////////////////////////////
399/// Get info about worker status
400
401Bool_t TCondor::GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
402{
403 TString cmd = Form("condor_status -format \"%%d:\" Mips -format \"%%s\\n\" FileSystemDomain "
404 "-const 'Name==\"%s\"'", vm);
405
406 PDB(kCondor,2) Info("GetVmInfo","command: %s", cmd.Data());
407 FILE *pipe = gSystem->OpenPipe(cmd, "r");
408
409 if (!pipe) {
410 SysError("GetVmInfo","cannot run command: %s", cmd.Data());
411 return kFALSE;
412 }
413
415 while (line.Gets(pipe)) {
416 PDB(kCondor,3) Info("GetVmInfo","line = %s", line.Data());
417 if (line != "") {
418 TString amips = line(TRegexp("^[0-9]*"));
419 perfidx = atoi(amips);
420 image = line(TRegexp("[^:]+$"));
421 break;
422 }
423 }
424
425 Int_t r = gSystem->ClosePipe(pipe);
426 if (r) {
427 Error("GetVmInfo","command: %s returned %d", cmd.Data(), r);
428 return kFALSE;
429 } else {
430 PDB(kCondor,1) Info("GetVmInfo","command: %s returned %d", cmd.Data(), r);
431 }
432
433 return kTRUE;
434}
435
436
437////////////////////////////////////////////////////////////////////////////////
438/// Get image of the worker
439
440TString TCondor::GetImage(const char *host) const
441{
442 TString cmd = Form("condor_status -direct %s -format \"Image:%%s\\n\" "
443 "FileSystemDomain", host);
444
445 PDB(kCondor,2) Info("GetImage","command: %s", cmd.Data());
446
447 FILE *pipe = gSystem->OpenPipe(cmd, "r");
448
449 if (!pipe) {
450 SysError("GetImage","cannot run command: %s", cmd.Data());
451 return "";
452 }
453
454 TString image;
456 while (line.Gets(pipe)) {
457 PDB(kCondor,3) Info("GetImage","line = %s", line.Data());
458 if (line != "") {
459 image = line(TRegexp("[^:]+$"));
460 break;
461 }
462 }
463
464 Int_t r = gSystem->ClosePipe(pipe);
465 if (r) {
466 Error("GetImage","command: %s returned %d", cmd.Data(), r);
467 return "";
468 } else {
469 PDB(kCondor,1) Info("GetImage","command: %s returned %d", cmd.Data(), r);
470 }
471
472 return image;
473}
474
475
476////////////////////////////////////////////////////////////////////////////////
477/// Print worker status
478
479void TCondorSlave::Print(Option_t * /*opt*/ ) const
480{
481 std::cout << "OBJ: " << IsA()->GetName()
482 << " " << fHostname << ":" << fPort
483 << " Perf: " << fPerfIdx
484 << " Image: " << fImage << std::endl;
485}
ROOT::R::TRInterface & r
Definition Object.C:4
const Bool_t kFALSE
Definition RtypesCore.h:101
long long Long64_t
Definition RtypesCore.h:80
const Bool_t kTRUE
Definition RtypesCore.h:100
const char Option_t
Definition RtypesCore.h:66
#define ClassImp(name)
Definition Rtypes.h:364
R__EXTERN TEnv * gEnv
Definition TEnv.h:170
#define PDB(mask, level)
Definition TProofDebug.h:56
char * Form(const char *fmt,...)
@ kExecutePermission
Definition TSystem.h:45
R__EXTERN TSystem * gSystem
Definition TSystem.h:559
virtual const char * GetName() const
Return name of this collection.
virtual void Print(Option_t *option="") const
Default print for collections, calls Print(option, 1).
void Print(Option_t *option="") const
Print worker status.
Definition TCondor.cxx:479
Int_t fPerfIdx
Definition TCondor.h:38
TString fClaimID
Definition TCondor.h:40
TString fImage
Definition TCondor.h:39
TString fHostname
Definition TCondor.h:36
Int_t fPort
Definition TCondor.h:37
Bool_t Suspend()
Suspend worker.
Definition TCondor.cxx:329
Bool_t Release()
Release worker.
Definition TCondor.cxx:357
Bool_t Resume()
Resume worker.
Definition TCondor.cxx:343
void Print(Option_t *option="") const
Print master status.
Definition TCondor.cxx:89
TList * Claim(Int_t n, const char *cmd)
Claim n virtual machines This function figures out the image and performance index before returning t...
Definition TCondor.cxx:233
TCondor(const char *pool="")
Create Condor interface object.
Definition TCondor.cxx:42
@ kActive
Definition TCondor.h:54
@ kFree
Definition TCondor.h:54
@ kSuspended
Definition TCondor.h:54
TCondorSlave * ClaimVM(const char *vm, const char *cmd)
Claim a VirtualMachine for PROOF usage.
Definition TCondor.cxx:101
virtual ~TCondor()
Cleanup Condor interface.
Definition TCondor.cxx:75
Bool_t fValid
Definition TCondor.h:58
TString GetImage(const char *host) const
Get image of the worker.
Definition TCondor.cxx:440
EState fState
Definition TCondor.h:60
Bool_t GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
Get info about worker status.
Definition TCondor.cxx:401
TList * fClaims
Definition TCondor.h:61
TList * GetVirtualMachines() const
Get the names of the virtual machines in the pool.
Definition TCondor.cxx:194
TString fPool
Definition TCondor.h:59
Bool_t SetState(EState state)
Set the state of workers.
Definition TCondor.cxx:288
virtual Int_t GetValue(const char *name, Int_t dflt) const
Returns the integer value for a resource.
Definition TEnv.cxx:491
A doubly linked list.
Definition TList.h:38
virtual void Add(TObject *obj)
Definition TList.h:81
virtual TObject * Remove(TObject *obj)
Remove object from the list.
Definition TList.cxx:822
virtual void Delete(Option_t *option="")
Remove all objects from the list AND delete all heap based objects.
Definition TList.cxx:470
virtual TObject * First() const
Return the first object in the list. Returns 0 when list is empty.
Definition TList.cxx:659
Collectable string class.
Definition TObjString.h:28
const char * GetName() const
Returns name of object.
Definition TObjString.h:38
virtual void SysError(const char *method, const char *msgfmt,...) const
Issue system error message.
Definition TObject.cxx:977
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition TObject.cxx:963
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition TObject.cxx:937
Regular expression class.
Definition TRegexp.h:31
Basic string class.
Definition TString.h:136
const char * Data() const
Definition TString.h:369
TString & ReplaceAll(const TString &s1, const TString &s2)
Definition TString.h:692
TString & Remove(Ssiz_t pos)
Definition TString.h:673
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition TString.h:639
virtual const char * Getenv(const char *env)
Get environment variable.
Definition TSystem.cxx:1663
virtual TTime Now()
Get current time in milliseconds since 0:00 Jan 1 1995.
Definition TSystem.cxx:466
virtual FILE * OpenPipe(const char *command, const char *mode)
Open a pipe.
Definition TSystem.cxx:665
virtual FILE * TempFileName(TString &base, const char *dir=nullptr)
Create a secure temporary file by appending a unique 6 letter string to base.
Definition TSystem.cxx:1497
virtual int ClosePipe(FILE *pipe)
Close the pipe.
Definition TSystem.cxx:674
virtual Int_t GetUid(const char *user=nullptr)
Returns the user's id. If user = 0, returns current user's id.
Definition TSystem.cxx:1560
virtual char * Which(const char *search, const char *file, EAccessMode mode=kFileExists)
Find location of file in a search path.
Definition TSystem.cxx:1546
virtual void Setenv(const char *name, const char *value)
Set environment variable.
Definition TSystem.cxx:1647
virtual int Unlink(const char *name)
Unlink, i.e.
Definition TSystem.cxx:1381
virtual const char * TempDirectory() const
Return a user configured or systemwide directory to create temporary files in.
Definition TSystem.cxx:1482
TLine * line
const Int_t n
Definition legend1.C:16
auto * l
Definition textangle.C:4