ROOT  6.06/09
Reference Guide
TProofSuperMaster.cxx
Go to the documentation of this file.
1 // @(#)root/proof:$Id$
2 // Author: Fons Rademakers 13/02/97
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2000, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////////
13 // //
14 // TProofSuperMaster //
15 // //
16 // This class controls a Parallel ROOT Facility, PROOF, cluster. //
17 // It fires the slave servers, it keeps track of how many slaves are //
18 // running, it keeps track of the slaves running status, it broadcasts //
19 // messages to all slaves, it collects results, etc. //
20 // //
21 //////////////////////////////////////////////////////////////////////////
22 
23 #include "TProofSuperMaster.h"
24 #include "TString.h"
25 #include "TObjString.h"
26 #include "TError.h"
27 #include "TList.h"
28 #include "TSortedList.h"
29 #include "TSlave.h"
30 #include "TMap.h"
31 #include "TProofServ.h"
32 #include "TSocket.h"
33 #include "TMonitor.h"
34 #include "TDSet.h"
35 #include "TPluginManager.h"
36 #include "TVirtualProofPlayer.h"
37 #include "TMessage.h"
38 #include "TUrl.h"
39 #include "TProofResourcesStatic.h"
40 #include "TProofNodeInfo.h"
41 #include "TROOT.h"
42 
44 
45 ////////////////////////////////////////////////////////////////////////////////
46 /// Start super master PROOF session.
47 
48 TProofSuperMaster::TProofSuperMaster(const char *masterurl, const char *conffile,
49  const char *confdir, Int_t loglevel,
50  const char *alias, TProofMgr *mgr)
51 {
52  // Default initializations
53  InitMembers();
54 
55  // This may be needed during init
56  fManager = mgr;
57 
58  fUrl = TUrl(masterurl);
59 
60  if (!conffile || !conffile[0])
61  conffile = kPROOF_ConfFile;
62  else if (!strncasecmp(conffile, "sm:", 3))
63  conffile+=3;
64  if (!confdir || !confdir[0])
65  confdir = kPROOF_ConfDir;
66 
67  // Instance type
68  fMasterServ = kTRUE;
69  ResetBit(TProof::kIsClient);
70  SetBit(TProof::kIsMaster);
71  SetBit(TProof::kIsTopMaster);
72 
73  Init(masterurl, conffile, confdir, loglevel, alias);
74 
75  // For Final cleanup
76  gROOT->GetListOfProofs()->Add(this);
77 }
78 
79 ////////////////////////////////////////////////////////////////////////////////
80 /// Start up PROOF submasters.
81 
83 {
84  // If this is a supermaster server, find the config file and start
85  // submaster servers as specified in the config file.
86  // There is a difference in startup between a slave and a submaster
87  // in which the submaster will issue a kPROOF_LOGFILE and
88  // then a kPROOF_LOGDONE message (which must be collected)
89  // while slaves do not.
90 
91  Int_t pc = 0;
92  TList *submasterList = new TList;
93  // Get list of workers
94  if (gProofServ->GetWorkers(submasterList, pc) == TProofServ::kQueryStop) {
95  Error("StartSlaves", "getting list of submaster nodes");
96  return kFALSE;
97  }
99  if (fImage.IsNull())
100  fImage = Form("%s:%s", TUrl(gSystem->HostName()).GetHostFQDN(),
102 
103  UInt_t nSubmasters = submasterList->GetSize();
104  UInt_t nSubmastersDone = 0;
105  Int_t ord = 0;
106  TList validSubmasters;
107  TList validPairs;
108  validPairs.SetOwner();
109 
110  // Loop over all submasters and start them
111  TListIter next(submasterList);
112  TObject *to;
113  TProofNodeInfo *submaster;
114  while ((to = next())) {
115  // Get the next submaster from the list
116  submaster = (TProofNodeInfo *)to;
117  const Char_t *conffile = submaster->GetConfig();
118  const Char_t *image = submaster->GetImage();
119  const Char_t *msd = submaster->GetMsd();
120  Int_t sport = submaster->GetPort();
121  if (sport == -1)
122  sport = fUrl.GetPort();
123 
124  TString fullord = TString(gProofServ->GetOrdinal()) + "." + ((Long_t) ord);
125 
126  // create submaster server
127  TUrl u(Form("%s:%d", submaster->GetNodeName().Data(), sport));
128  // Add group info in the password firdl, if any
129  if (strlen(gProofServ->GetGroup()) > 0) {
130  // Set also the user, otherwise the password is not exported
131  if (strlen(u.GetUser()) <= 0)
132  u.SetUser(gProofServ->GetUser());
133  u.SetPasswd(gProofServ->GetGroup());
134  }
135  TSlave *slave =
136  CreateSubmaster(u.GetUrl(), fullord, image, msd);
137 
138  // Add to global list (we will add to the monitor list after
139  // finalizing the server startup)
140  Bool_t submasterOk = kTRUE;
141  fSlaves->Add(slave);
142  if (slave->IsValid()) {
143  validPairs.Add(new TPair(slave, new TObjString(conffile)));
144  } else {
145  submasterOk = kFALSE;
146  fBadSlaves->Add(slave);
147  }
148 
149  PDB(kGlobal,3)
150  Info("StartSlaves","submaster on host %s created and"
151  " added to list", submaster->GetNodeName().Data());
152 
153  // Notify opening of connection
154  nSubmastersDone++;
156  m << TString("Opening connections to submasters") << nSubmasters
157  << nSubmastersDone << submasterOk;
158  gProofServ->GetSocket()->Send(m);
159 
160  ord++;
161 
162  } // end loop over all submasters
163 
164  // Cleanup
165  SafeDelete(submasterList);
166 
167  nSubmastersDone = 0;
168 
169  // Here we finalize the server startup: in this way the bulk
170  // of remote operations are almost parallelized
171  TIter nxsc(&validPairs);
172  TPair *sc = 0;
173  while ((sc = (TPair *) nxsc())) {
174  // Finalize setup of the server
175  TSlave *sl = (TSlave *) sc->Key();
176  TObjString *cf = (TObjString *) sc->Value();
177  sl->SetupServ(TSlave::kMaster, cf->GetName());
178 
179  // Monitor good slaves
180  Bool_t submasterOk = kTRUE;
181  if (sl->IsValid()) {
182  // check protocol compatability
183  // protocol 1 is not supported anymore
184  if (fProtocol == 1) {
185  Error("StartSlaves", "master and submaster protocols"
186  " not compatible (%d and %d)",
188  submasterOk = kFALSE;
189  fBadSlaves->Add(sl);
190  } else {
191  fAllMonitor->Add(sl->GetSocket());
192  validSubmasters.Add(sl);
193  }
194  } else {
195  submasterOk = kFALSE;
196  fBadSlaves->Add(sl);
197  }
198 
199  // Notify end of startup operations
200  nSubmastersDone++;
202  m << TString("Setting up submasters") << nSubmasters
203  << nSubmastersDone << submasterOk;
204  gProofServ->GetSocket()->Send(m);
205  }
206 
207  Collect(kAll); //Get kPROOF_LOGFILE and kPROOF_LOGDONE messages
208  TIter nextSubmaster(&validSubmasters);
209  while (TSlave* sl = dynamic_cast<TSlave*>(nextSubmaster())) {
210  if (sl->GetStatus() == -99) {
211  Error("StartSlaves", "not allowed to connect to PROOF master server");
212  fBadSlaves->Add(sl);
213  continue;
214  }
215 
216  if (!sl->IsValid()) {
217  Error("StartSlaves", "failed to setup connection with PROOF master server");
218  fBadSlaves->Add(sl);
219  continue;
220  }
221  }
222 
223  return kTRUE;
224 }
225 
226 ////////////////////////////////////////////////////////////////////////////////
227 /// Process a data set (TDSet) using the specified selector (.C) file.
228 /// Entry- or event-lists should be set in the data set object using
229 /// TDSet::SetEntryList.
230 /// The return value is -1 in case of error and TSelector::GetStatus()
231 /// in case of success.
232 
233 Long64_t TProofSuperMaster::Process(TDSet *set, const char *selector, Option_t *option,
234  Long64_t nentries, Long64_t first)
235 {
236  if (!IsValid()) return -1;
237 
238  R__ASSERT(GetPlayer());
239 
240  if (GetProgressDialog())
241  GetProgressDialog()->ExecPlugin(5, this, selector, set->GetListOfElements()->GetSize(),
242  first, nentries);
243 
244  return GetPlayer()->Process(set, selector, option, nentries, first);
245 }
246 
247 ////////////////////////////////////////////////////////////////////////////////
248 /// Validate a TDSet.
249 
251 {
252  if (dset->ElementsValid()) return;
253 
254  // We need to recheck after this
257 
258  TList msds;
259  msds.SetOwner();
260 
261  TList smholder;
262  smholder.SetOwner();
263  TList elemholder;
264  elemholder.SetOwner();
265 
266  // build nodelist with slaves and elements
267  TIter nextSubmaster(GetListOfActiveSlaves());
268  while (TSlave *sl = dynamic_cast<TSlave*>(nextSubmaster())) {
269  TList *smlist = 0;
270  TPair *p = dynamic_cast<TPair*>(msds.FindObject(sl->GetMsd()));
271  if (!p) {
272  smlist = new TList;
273  smlist->SetName(sl->GetMsd());
274 
275  smholder.Add(smlist);
276  TList *elemlist = new TSortedList(kSortDescending);
277  elemlist->SetName(TString(sl->GetMsd())+"_elem");
278  elemholder.Add(elemlist);
279  msds.Add(new TPair(smlist, elemlist));
280  } else {
281  smlist = dynamic_cast<TList*>(p->Key());
282  }
283  if (smlist) smlist->Add(sl);
284  }
285 
286  TIter nextElem(dset->GetListOfElements());
287  while (TDSetElement *elem = dynamic_cast<TDSetElement*>(nextElem())) {
288  if (elem->GetValid()) continue;
289  TPair *p = dynamic_cast<TPair*>(msds.FindObject(elem->GetMsd()));
290  if (p && p->Value()) {
291  TList *xl = dynamic_cast<TList*>(p->Value());
292  if (xl) xl->Add(elem);
293  } else {
294  Error("ValidateDSet", "no mass storage domain '%s' associated"
295  " with available submasters",
296  elem->GetMsd());
297  return;
298  }
299  }
300 
301  // send to slaves
302  TList usedsms;
303  TIter nextSM(&msds);
304  SetDSet(dset); // set dset to be validated in Collect()
305  while (TPair *msd = dynamic_cast<TPair*>(nextSM())) {
306  TList *sms = dynamic_cast<TList*>(msd->Key());
307  TList *setelements = dynamic_cast<TList*>(msd->Value());
308 
309  // distribute elements over the slaves
310  Int_t nsms = sms ? sms->GetSize() : -1;
311  Int_t nelements = setelements ? setelements->GetSize() : -1;
312  for (Int_t i=0; i<nsms; i++) {
313 
314  TDSet set(dset->GetType(), dset->GetObjName(),
315  dset->GetDirectory());
316  for (Int_t j = (i*nelements)/nsms;
317  j < ((i+1)*nelements)/nsms;
318  j++) {
319  TDSetElement *elem = setelements ?
320  dynamic_cast<TDSetElement*>(setelements->At(j)) : (TDSetElement *)0;
321  if (elem) {
322  set.Add(elem->GetFileName(), elem->GetObjName(),
323  elem->GetDirectory(), elem->GetFirst(),
324  elem->GetNum(), elem->GetMsd());
325  }
326  }
327 
328  if (set.GetListOfElements()->GetSize()>0) {
330  mesg << &set;
331 
332  TSlave *sl = dynamic_cast<TSlave*>(sms->At(i));
333  if (sl) {
334  PDB(kGlobal,1)
335  Info("ValidateDSet",
336  "Sending TDSet with %d elements to worker %s"
337  " to be validated", set.GetListOfElements()->GetSize(),
338  sl->GetOrdinal());
339  sl->GetSocket()->Send(mesg);
340  usedsms.Add(sl);
341  } else {
342  Warning("ValidateDSet", "not a TSlave object");
343  }
344  }
345  }
346  }
347 
348  PDB(kGlobal,1)
349  Info("ValidateDSet","Calling Collect");
350  Collect(&usedsms);
351  SetDSet(0);
352 }
353 
354 ////////////////////////////////////////////////////////////////////////////////
355 /// Construct a TProofPlayer object. The player string specifies which
356 /// player should be created: remote, slave, sm (supermaster) or base.
357 /// Default is sm. Socket is needed in case a slave player is created.
358 
360 {
361  if (!player)
362  player = "sm";
363 
364  SetPlayer(TVirtualProofPlayer::Create(player, this, s));
365  return GetPlayer();
366 }
367 
const char * GetName() const
Returns name of object.
Definition: TObjString.h:42
Bool_t StartSlaves(Bool_t)
Start up PROOF submasters.
const char * GetOrdinal() const
Definition: TSlave.h:135
void ValidateDSet(TDSet *dset)
Validate a TDSet.
long long Long64_t
Definition: RtypesCore.h:69
TSocket * GetSocket() const
Definition: TSlave.h:138
Bool_t IsValid() const
Definition: TProof.h:970
virtual EQueryAction GetWorkers(TList *workers, Int_t &prioritychange, Bool_t resume=kFALSE)
Get list of workers to be used from now on.
ClassImp(TSeqCollection) Int_t TSeqCollection TIter next(this)
Return index of object in collection.
const char Int_t const char TProof Int_t const char const char * msd
Definition: TXSlave.cxx:46
Collectable string class.
Definition: TObjString.h:32
const char Option_t
Definition: RtypesCore.h:62
const char * GetObjName() const
Definition: TDSet.h:229
virtual Int_t Send(const TMessage &mess)
Send a TMessage object.
Definition: TSocket.cxx:520
This class represents a WWW compatible URL.
Definition: TUrl.h:41
void SetPlayer(TVirtualProofPlayer *player)
Set a new PROOF player.
Definition: TProof.cxx:10766
Definition: TDSet.h:153
virtual void SetOwner(Bool_t enable=kTRUE)
Set whether this collection is the owner (enable==true) of its content.
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition: TObject.cxx:892
TString fImage
Definition: TProof.h:601
#define R__ASSERT(e)
Definition: TError.h:98
#define gROOT
Definition: TROOT.h:340
virtual void Add(TSocket *sock, Int_t interest=kRead)
Add socket to the monitor's active list.
Definition: TMonitor.cxx:168
Basic string class.
Definition: TString.h:137
void SetDSet(TDSet *dset)
Definition: TProof.h:772
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
virtual TObject * FindObject(const char *name) const
Find an object in this list using its name.
Definition: TList.cxx:496
const char * GetGroup() const
Definition: TProofServ.h:254
Long_t ExecPlugin(int nargs, const T &...params)
virtual TObject * At(Int_t idx) const
Returns the object at position idx. Returns 0 if idx is out of range.
Definition: TList.cxx:310
Iterator of linked list.
Definition: TList.h:187
const char * GetObjName() const
Definition: TDSet.h:122
virtual Long64_t Process(TDSet *set, const char *selector, Option_t *option="", Long64_t nentries=-1, Long64_t firstentry=0)=0
Long64_t GetNum() const
Definition: TDSet.h:116
const char * Data() const
Definition: TString.h:349
const TString & GetMsd() const
const char * GetDirectory() const
Definition: TDSet.h:230
#define SafeDelete(p)
Definition: RConfig.h:436
TList * fBadSlaves
Definition: TProof.h:605
ClassImp(TProofSuperMaster) TProofSuperMaster
Start super master PROOF session.
const TString & GetConfig() const
#define PDB(mask, level)
Definition: TProofDebug.h:58
const char * ord
Definition: TXSlave.cxx:46
const char * GetMsd() const
Definition: TDSet.h:119
void Init(TClassEdit::TInterpreterLookupHelper *helper)
Definition: TClassEdit.cxx:118
TVirtualProofPlayer * GetPlayer() const
Definition: TProof.h:751
Int_t Collect(const TSlave *sl, Long_t timeout=-1, Int_t endtype=-1, Bool_t deactonfail=kFALSE)
Collect responses from slave sl.
Definition: TProof.cxx:2664
TList * GetListOfElements() const
Definition: TDSet.h:231
A sorted doubly linked list.
Definition: TSortedList.h:30
TSlave * CreateSubmaster(const char *url, const char *ord, const char *image, const char *msd, Int_t nwk=1)
Create a new TSlave of type TSlave::kMaster.
Definition: TProof.cxx:1870
TList * fSlaves
Definition: TProof.h:603
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:918
const Bool_t kSortDescending
Definition: TList.h:41
const char * GetWorkDir() const
Definition: TProofServ.h:255
Long64_t GetFirst() const
Definition: TDSet.h:114
static TVirtualProofPlayer * Create(const char *player, TProof *p, TSocket *s=0)
Create a PROOF player.
A doubly linked list.
Definition: TList.h:47
const char *const kPROOF_ConfFile
Definition: TProof.h:145
Bool_t ElementsValid()
Check if all elements are valid.
Definition: TDSet.cxx:1527
TMonitor * fAllMonitor
Definition: TProof.h:606
Int_t GetPort() const
Definition: TUrl.h:87
TSocket * GetSocket() const
Definition: TProofServ.h:269
const char * GetFileName() const
Definition: TDSet.h:113
R__EXTERN TSystem * gSystem
Definition: TSystem.h:549
const char *const kPROOF_ConfDir
Definition: TProof.h:146
TPluginHandler * GetProgressDialog() const
Definition: TProof.h:777
Long64_t Process(TDSet *set, const char *selector, Option_t *option="", Long64_t nentries=-1, Long64_t firstentry=0)
Process a data set (TDSet) using the specified selector (.C) file.
TObject * Value() const
Definition: TMap.h:125
unsigned int UInt_t
Definition: RtypesCore.h:42
TMarker * m
Definition: textangle.C:8
char * Form(const char *fmt,...)
const Int_t kPROOF_Protocol
Definition: TProof.h:143
Bool_t IsNull() const
Definition: TString.h:387
void SetName(const char *name)
Definition: TCollection.h:116
Int_t fProtocol
Definition: TProof.h:602
const char * GetUser() const
Definition: TProofServ.h:253
const char * GetImage() const
Definition: TProofServ.h:256
long Long_t
Definition: RtypesCore.h:50
Class used by TMap to store (key,value) pairs.
Definition: TMap.h:106
virtual TVirtualProofPlayer * MakePlayer(const char *player=0, TSocket *s=0)
Construct a TProofPlayer object.
TList * GetListOfActiveSlaves() const
Definition: TProof.h:758
virtual Int_t GetSize() const
Definition: TCollection.h:95
virtual const char * HostName()
Return the system's host name.
Definition: TSystem.cxx:307
TObject * Key() const
Definition: TMap.h:124
const char * GetOrdinal() const
Definition: TProofServ.h:265
int nentries
Definition: THbookFile.cxx:89
Mother of all ROOT objects.
Definition: TObject.h:58
char Char_t
Definition: RtypesCore.h:29
R__EXTERN TProofServ * gProofServ
Definition: TProofServ.h:359
virtual void Add(TObject *obj)
Definition: TList.h:81
const TString & GetNodeName() const
const char * GetType() const
Definition: TDSet.h:228
void ResetBit(UInt_t f)
Definition: TObject.h:172
virtual Bool_t IsValid() const
Definition: TSlave.h:154
Definition: TSlave.h:50
const TString & GetImage() const
const Bool_t kTRUE
Definition: Rtypes.h:91
TUrl fUrl
Definition: TProof.h:598
virtual Int_t SetupServ(Int_t stype, const char *conffile)
Init a PROOF slave object.
Definition: TSlave.cxx:181
Int_t GetPort() const
const char * GetDirectory() const
Return directory where to look for object.
Definition: TDSet.cxx:256
const char Int_t const char * image
Definition: TXSlave.cxx:46
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition: TObject.cxx:904